Source code for aoiro.reader._io

import re
import warnings
from collections.abc import Iterable
from decimal import Decimal
from pathlib import Path
from typing import Any

import pandas as pd
from dateparser import parse

from .._ledger import GeneralLedgerLineImpl, LedgerElementImpl


def read_all_csvs(path: Path, /, **kwargs: Any) -> pd.DataFrame:
    """
    Read all CSV files in the path.

    The directory is searched recursively. Files are read in sorted path
    order so that the row order of the result does not depend on the
    (arbitrary) order in which the file system lists them.

    Parameters
    ----------
    path : Path
        The path to the directory containing CSV files.
    **kwargs : Any
        The keyword arguments for `pd.read_csv`.

    Returns
    -------
    pd.DataFrame
        The concatenated DataFrame with column "path" containing
        the relative path (POSIX style) of the CSV file added.
        If no CSV file is found, an empty DataFrame that only has
        the "path" column.

    """
    dfs = []
    # rglob() gives no ordering guarantee; sort for reproducible output.
    for p in sorted(path.rglob("*.csv")):
        df = pd.read_csv(p, **kwargs)
        df["path"] = p.relative_to(path).as_posix()
        dfs.append(df)
    if not dfs:
        return pd.DataFrame(columns=["path"])
    # The per-file indices are kept as they are (no ignore_index).
    return pd.concat(dfs)
def parse_date(s: str) -> pd.Timestamp: """ Parse date. Prefer the last day of the month if the day is not provided. Parameters ---------- s : str The string to parse. Returns ------- pd.Timestamp The parsed date. """ return pd.Timestamp(parse(s, settings={"PREFER_DAY_OF_MONTH": "last"})) def parse_money( s: str, currency: str | None = None ) -> tuple[Decimal | None, str | None]: """ Parse money. Parameters ---------- s : str The string to parse. currency : str | None, optional The currency, by default None. If provided, the currency in the string would be ignored and replaced by this. Returns ------- tuple[Decimal | None, str | None] The amount and the currency. """ match = re.search(r"-?[\d.]+", s) if match is None: return None, None amount = Decimal(match.group()) if currency is None: currency = re.sub(r"\s+", "", s[: match.start()] + s[match.end() :]) return amount, currency
def read_simple_csvs(path: Path) -> pd.DataFrame:
    """
    Read all CSV files in the path.

    The CSV files are assumed to have columns ["発生日", "金額"].
    Either column is created (filled with missing values) if absent.
    Every column whose name contains "日" is parsed as a date.

    Parameters
    ----------
    path : Path
        The path to the directory containing CSV files.

    Returns
    -------
    pd.DataFrame
        The concatenated DataFrame with columns
        ["発生日", "金額", "通貨", "path"], indexed by "発生日"
        (the column itself is kept as well).
        If no rows are found, the empty DataFrame is returned as is.

    """
    df = read_all_csvs(path, dtype=str)
    if df.empty:
        return df
    for col in ["発生日", "金額"]:
        if col not in df.columns:
            df[col] = None

    # parse date
    # Cells that are not strings (missing column / empty cell) cannot be
    # given to the date parser; they become NaT instead of raising.
    for k in df.columns:
        if "日" not in k:
            continue
        df[k] = df[k].map(
            lambda v: parse_date(v) if isinstance(v, str) else pd.NaT
        )

    # parse money
    # Same guard as above: missing amounts become (None, None).
    parsed = [
        parse_money(v) if isinstance(v, str) else (None, None)
        for v in df["金額"]
    ]
    df[["金額", "通貨"]] = pd.DataFrame(parsed, index=df.index)

    # set date as index
    df.set_index("発生日", inplace=True, drop=False)
    return df
def read_general_ledger(path: Path) -> Iterable[GeneralLedgerLineImpl[Any, Any]]:
    """
    Read general ledger.

    All CSV files under the "general" subdirectory of `path` are read
    without a header row.

    The first column is assumed to be the date.
    For all n in N. the 2n-1-th column is assumed to be the account name,
    and the 2n-th column is assumed to be the amount.
    A row may use fewer account/amount pairs than the widest row;
    the unused trailing cells are skipped.

    Parameters
    ----------
    path : Path
        The path to the directory containing the "general" directory.

    Returns
    -------
    Iterable[GeneralLedgerLineImpl[Any, Any]]
        The general ledger. This is a generator, so the files are read
        and the errors below are raised on first iteration.

    Raises
    ------
    ValueError
        If the number of columns is even or less than 3.

    """
    df = read_all_csvs(path / "general", header=None, dtype=str)
    df.drop(columns="path", inplace=True)
    if df.empty:
        return
    if len(df.columns) % 2 != 1:
        raise ValueError("The number of columns should be odd.")
    if len(df.columns) < 3:
        raise ValueError("The number of columns should be at least 3.")
    for _, row in df.iterrows():
        values: list[LedgerElementImpl[Any, Any]] = []
        for i in range(1, len(row), 2):
            account, raw_amount = row[i], row[i + 1]
            # Short rows are padded with NaN, which is not a string and
            # must not reach parse_money.
            if pd.isna(raw_amount):
                if not pd.isna(account):
                    # An account without an amount is likely a data error.
                    warnings.warn(
                        f"Amount missing for account {account}", stacklevel=2
                    )
                continue
            amount, currency = parse_money(raw_amount)
            if amount is None:
                warnings.warn(f"Amount not found in {raw_amount}", stacklevel=2)
                continue
            values.append(
                LedgerElementImpl(
                    account=account,
                    amount=amount,
                    currency=currency,
                )
            )
        yield GeneralLedgerLineImpl(
            values=values,
            date=parse_date(row[0]),
        )