# Source code for rfactor.rain

import re
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm



# [docs]
class RainfallFilesIOMsg(str):
    """String subclass whose ``repr`` is the plain message text.

    A regular ``str`` repr adds surrounding quotes and escapes newlines; this
    class returns the text unchanged so that a message wrapped in it is
    displayed as written.
    """

    def __repr__(self):
        # Delegate to the string conversion: no quoting, no escaping.
        return self.__str__()



def _check_path(file_path):
    """Validate that ``file_path`` is a ``pathlib.Path`` and give feedback if not.

    Parameters
    ----------
    file_path : pathlib.Path
        Object to validate.

    Raises
    ------
    TypeError
        If ``file_path`` is not a ``pathlib.Path`` instance. When a plain string
        is passed, the message contains the exact ``Path(...)`` call needed to
        convert it.
    """
    if isinstance(file_path, Path):
        return

    if isinstance(file_path, str):
        # ``!r`` keeps the quotes around the string, so the suggested snippet
        # can be copied as-is (``Path('some/dir')`` rather than
        # ``Path(some/dir)``).
        raise TypeError(
            f"'file_path' should be a 'pathlib.Path' object, use "
            f"Path({file_path!r}) to convert string file_path to valid 'Path'."
        )

    raise TypeError("'file_path' should be a pathlib.Path object")



# [docs]
def load_rain_file(file_path, load_fun, **kwargs):
    """Load a rainfall data file with a given load function and validate it.

    Parameters
    ----------
    file_path : pathlib.Path
        File path with rainfall data. The file should follow the input data
        format expected by ``load_fun``.

    load_fun : Callable
        Function called as ``load_fun(file_path, **kwargs)``. Please check the
        required input format of the load function used. Its output must be a
        ``pandas.DataFrame`` holding at least the columns:

        - *datetime* (datetime64[ns]): timestamp
        - *station* (object or string dtype): name of the station
        - *rain_mm* (float): rainfall in mm

    kwargs:
        Keyword arguments passed on to ``load_fun``.

    Returns
    -------
    rain : pandas.DataFrame
        A new DataFrame with all columns returned by ``load_fun`` plus:

        - *year* (int): year of the measurement, derived from *datetime*
        - *tag* (str): tag identifier, formatted as ``STATION_YEAR``

    Raises
    ------
    IOError
        If the output of ``load_fun`` is not a DataFrame, misses one of the
        required columns, or has a column of an unexpected dtype.
    """
    rain = load_fun(file_path, **kwargs)
    # Not every callable carries ``__name__`` (e.g. ``functools.partial``);
    # fall back to its repr so building the error message cannot itself fail.
    fun_name = getattr(load_fun, "__name__", repr(load_fun))

    if not isinstance(rain, pd.DataFrame):
        msg = f"Load function '{fun_name}' must return pandas.DataFrame"
        raise IOError(RainfallFilesIOMsg(msg))

    if not {"datetime", "station", "rain_mm"}.issubset(rain.columns):
        msg = (
            f"Load function '{fun_name}' must return columns 'datetime', "
            f"'station' and 'rain_mm'."
        )
        raise IOError(RainfallFilesIOMsg(msg))

    if not pd.api.types.is_datetime64_ns_dtype(rain["datetime"]):
        msg = (
            f"Load function '{fun_name}' must return datetime64[ns] type for "
            f"column 'datetime'."
        )
        raise IOError(RainfallFilesIOMsg(msg))

    # Accept both the classic object dtype and the dedicated pandas string
    # dtypes: the latter also hold station names but are not 'object'.
    station = rain["station"]
    if not (
        pd.api.types.is_object_dtype(station)
        or pd.api.types.is_string_dtype(station)
    ):
        msg = (
            f"Load function '{fun_name}' must return object (str) type for "
            f"column 'station'."
        )
        raise IOError(RainfallFilesIOMsg(msg))

    if not pd.api.types.is_float_dtype(rain["rain_mm"]):
        msg = (
            f"Load function '{fun_name}' must return float for column "
            f"'rain_mm'."
        )
        raise IOError(RainfallFilesIOMsg(msg))

    year = rain["datetime"].dt.year
    tag = station.astype(str) + "_" + year.astype(str)
    # ``assign`` returns a new frame, so the object handed over by ``load_fun``
    # is left untouched (and no chained-assignment warning can be triggered).
    return rain.assign(year=year, tag=tag)




# [docs]
def load_rain_folder(folder_path, load_fun, **kwargs):
    """Load every ``.txt`` rainfall file of a folder into a single DataFrame.

    Parameters
    ----------
    folder_path : pathlib.Path
        Folder holding the rainfall data as ``.txt`` files. Each file is read
        with :func:`rfactor.process.load_rain_file`.

    load_fun : Callable
        Load function applied to each file; see
        :func:`rfactor.process.load_rain_file` for the output it has to
        provide.

    kwargs:
        Keyword arguments for load_fun

    Returns
    -------
    rain : pandas.DataFrame
        Rows of all files, sorted on station and datetime and renumbered from
        zero. See definition in :func:`rfactor.process.load_rain_file`.

    Raises
    ------
    TypeError
        If ``folder_path`` is not a ``pathlib.Path``.
    FileNotFoundError
        If the folder does not exist or holds no ``.txt`` files.
    ValueError
        If ``folder_path`` points to a file.
    """
    _check_path(folder_path)

    if not folder_path.exists():
        raise FileNotFoundError(f"Input folder '{folder_path}' does not exists.")
    if folder_path.is_file():
        raise ValueError(
            "`folder_path` need to be the path to a directory instead of a file"
        )

    txt_files = list(folder_path.glob("*.txt"))
    if not txt_files:
        raise FileNotFoundError(
            f"Input folder '{folder_path}' does not contain any 'txt'-files."
        )

    frames = [
        load_rain_file(txt_file, load_fun, **kwargs)
        for txt_file in tqdm(txt_files, desc="Processing input files")
    ]

    # One frame for all stations/years, ordered and with a fresh 0..n-1 index.
    combined = pd.concat(frames).sort_values(["station", "datetime"])
    return combined.reset_index(drop=True)



def _extract_metadata_from_file_path(file_path):
    """Split the station name and year out of a rainfall file name.

    The file name has to look like ``STATION_NAME_YYYY.txt``: everything in
    front of the last underscore is the measurement station, the four digits
    after it are the year of the measurement.

    Parameters
    ----------
    file_path : pathlib.Path
        File path of the file to extract station/year from

    Returns
    -------
    station : str
    year : str

    Raises
    ------
    ValueError
        If the file name (without extension) does not end in ``_YYYY``.
    """
    stem = file_path.stem
    if re.fullmatch(".*_[0-9]{4}$", stem) is None:
        raise ValueError(
            "Input file_path_format should match with 'STATION_NAME_YYYY.txt'"
        )

    # The pattern guarantees an underscore, so splitting on the last one
    # yields (station, "_", year).
    station, _, year = stem.rpartition("_")
    return station, year



# [docs]
def load_rain_file_matlab_legacy(file_path):
    """Read one legacy-Matlab rainfall file (a **single station/year**).

    The file is a headerless, space-delimited text file (extension ``.txt``).
    Its first column holds the minutes since the start of the year, the second
    the rainfall depth during the last t minutes (t being the temporal
    resolution of the timeseries); only non-zero rainfall is listed. Station
    and year are taken from the file name, which has to end in ``_YEAR.txt``.

    Parameters
    ----------
    file_path : pathlib.Path
        File path with rainfall data according to the format described above.

    Returns
    -------
    rain : pandas.DataFrame
        DataFrame with rainfall time series. Contains the following columns:

        - *datetime* (pandas.Timestamp): 1 January of the year plus the
          minutes read from the file
        - *station* (str): station name
        - *rain_mm* (float): Rain in mm

    Raises
    ------
    ValueError
        If ``file_path`` is a directory or the file name does not end in
        ``_YYYY``.
    IOError
        If the first column contains missing values.

    Example
    -------
    1. Example of a rainfall file:

    ::

       9390 1.00
       9470 0.20
       9480 0.50
       10770 0.10
       ...  ...
    """
    if file_path.is_dir():
        raise ValueError(
            "`file_path` need to be the path to a file instead of a directory"
        )

    station, year = _extract_metadata_from_file_path(file_path)
    raw = pd.read_csv(
        file_path, delimiter=" ", header=None, names=["minutes_since", "rain_mm"]
    )

    # A missing timestamp means the file does not follow the expected layout.
    if raw["minutes_since"].isnull().any():
        raise IOError(
            RainfallFilesIOMsg(
                "Timestamp (i.e. minutes from start of year) column contains "
                "NaN-values. Input should be a (space-delimited) text file with the "
                "first column being the timestamp from the start of the year (minutes),"
                " and second the rainfall depth (in mm, non-zero series): \n \n9390 "
                "1.00\n9470 0.20\n9480 0.50\n... ..."
            )
        )

    # Convert the minute offsets into absolute timestamps within the year.
    start_of_year = pd.Timestamp(f"{year}-01-01")
    offsets = pd.to_timedelta(pd.to_numeric(raw["minutes_since"]), unit="min")
    rain = raw.assign(datetime=start_of_year + offsets, station=station)

    return rain[["datetime", "station", "rain_mm"]]




# [docs]
def load_rain_file_flanders(
    file_path, interpolate=None, interval=np.inf, threshold_outliers=None
):
    """Example load function developed in the context of Flanders.

    Translates the input file to the default input data used by this package.
    This function can serve users as an example of how to write a load
    function. The file is a headerless, tab-delimited text file (extension
    ``.txt``) that holds the timeseries of one location. The station name is
    taken from the file name, which has to end in ``_YYYY.txt``.

    Parameters
    ----------
    file_path: pathlib.Path
        File path (tab delimited, .txt-extension). Headerless, two columns:

        - timestamp (documented format: ``%d-%m-%Y %H:%M:%S``)
        - rainfall depth (float); ``---`` and empty fields are read as NaN

    interpolate: str, default None
        Interpolation method to use for NaN-values. Possible values:
        see pandas.DataFrame.interpolate. No interpolation when None.

    interval: int, default np.inf
        The max interval length over which NaN values are interpolated; runs
        of consecutive NaN-values longer than this are dropped instead. Only
        used when ``interpolate`` is set. The value needs to fit the index of
        the timeseries. For example, a timeseries with resolution of 10 min
        will have a maximum interval length of 6 hours if the interval value
        is set to 36 (36 * 10 min = 6 hours).

    threshold_outliers: int, default None
        Set rainfall values above this threshold to NaN. No upper threshold is
        applied when None.

    Returns
    -------
    rain : pandas.DataFrame
        DataFrame with the strictly positive rainfall records. Contains the
        following columns:

        - *datetime* (pandas.Timestamp): Time stamp.
        - *station* (str): station identifier.
        - *rain_mm* (float): Rain in mm.

    Raises
    ------
    ValueError
        If the file name (without extension) does not end in ``_YYYY``.

    Example
    -------
    Example of a rainfall file (the two columns are separated by one tab):

    ::

        2024-01-01 00:00:00    0.0
        2024-01-01 00:10:00    0.0
        2024-01-01 00:20:00    0.0
        2024-01-01 00:30:00    10.5
        2024-01-01 00:40:00    5.2
        2024-01-01 00:50:00    1
        2024-01-01 01:00:00    0.02
        2024-01-01 01:10:00

    """
    # ``names`` guarantees that both columns exist, so no separate check on
    # the presence of the columns is needed afterwards.
    df = pd.read_csv(
        file_path,
        sep="\t",
        header=None,
        names=["datetime", "rain_mm"],
        na_values=["---", ""],
    )

    # NOTE(review): the format is inferred by pandas. The documented
    # day-first format and the ISO example above disagree - confirm which one
    # the real files use before pinning ``format=``/``dayfirst=``.
    df["datetime"] = pd.to_datetime(df["datetime"])

    # Only the station is needed here; the helper still validates that the
    # file name ends in a four-digit year.
    station, _ = _extract_metadata_from_file_path(file_path)
    df["station"] = station

    # Sanitize rain outliers (negative values and values beyond threshold)
    df.loc[df["rain_mm"] < 0, "rain_mm"] = np.nan
    # Compare against None so that a threshold of 0 is applied as well.
    if threshold_outliers is not None:
        df.loc[df["rain_mm"] > threshold_outliers, "rain_mm"] = np.nan

    # Short-period NaN values are interpolated
    if interpolate:
        # Label each run of consecutive equal NaN/non-NaN flags, then drop the
        # NaN runs that are longer than the interval.
        is_nan = df["rain_mm"].isna()
        group = (is_nan != is_nan.shift()).cumsum()
        too_long = df[is_nan].groupby(group[is_nan]).transform("size") > interval
        df = df.drop(index=too_long[too_long].index)
        # Interpolate the remaining NaN-values
        df["rain_mm"] = df["rain_mm"].interpolate(method=interpolate)

    # Keep strictly positive rain only. NaN never compares greater than zero,
    # so this removes the remaining NaN-values too. The copy makes the result
    # independent of the intermediate frame before the dtype is set.
    df = df.loc[df["rain_mm"] > 0].copy()
    df["rain_mm"] = df["rain_mm"].astype(np.float64)

    return df[["datetime", "station", "rain_mm"]]