Source code for config_versioned.autoread

"""Auto-read files based on their extension."""

import os
from pathlib import Path

from config_versioned.utilities import pull_from_config


def _require_or_raise(pkg_name, install_extra=None):
    """Import a package by name, raising a helpful ImportError on failure.

    Parameters
    ----------
    pkg_name : str
        Importable module name, passed to ``importlib.import_module``.
    install_extra : str, optional
        Name of the optional-dependency "extra" to suggest in the error
        message. When falsy, the message suggests installing ``pkg_name``
        directly.

    Returns
    -------
    module
        The imported module object.

    Raises
    ------
    ImportError
        If the import fails. The exception carries an install hint, has its
        ``name`` attribute set to ``pkg_name``, and is explicitly chained to
        the original import failure (available as ``__cause__``).
    """
    import importlib

    try:
        return importlib.import_module(pkg_name)
    except ImportError as err:
        # NOTE(review): the hint hard-codes "versioning" as the pip
        # distribution name -- confirm it matches the published package.
        if install_extra:
            hint = f" Install with: pip install versioning[{install_extra}]"
        else:
            hint = f" Install with: pip install {pkg_name}"
        # Chain explicitly so the underlying failure (e.g. a broken transitive
        # dependency rather than a missing package) stays visible.
        raise ImportError(
            f"Package '{pkg_name}' is required to read this file type.{hint}",
            name=pkg_name,
        ) from err



[docs]
def get_file_reading_functions():
    """Return a dict mapping file extensions to reading functions.

    Optional third-party packages are imported lazily inside each reader, so
    building the mapping itself never requires them.

    Returns
    -------
    dict
        Keys are lowercase file extensions (without the dot). Values are
        callables with signature ``f(file, **kwargs)`` that read the file
        and return the loaded object.
    """
    def read_csv(file, **kwargs):
        pd = _require_or_raise("pandas", "pandas")
        return pd.read_csv(file, **kwargs)

    def read_tsv(file, **kwargs):
        pd = _require_or_raise("pandas", "pandas")
        # Default to tab separation, but let the caller override ``sep``.
        kwargs.setdefault("sep", "\t")
        return pd.read_csv(file, **kwargs)

    def read_dbf(file, **kwargs):
        dbfread = _require_or_raise("dbfread", "dbfread")
        table = dbfread.DBF(str(file), **kwargs)
        # Only the pandas import is guarded: an ImportError raised while
        # building the DataFrame must not silently trigger the list fallback.
        try:
            pd = _require_or_raise("pandas", "pandas")
        except ImportError:
            return list(table)
        return pd.DataFrame(iter(table))

    def read_dta(file, **kwargs):
        pd = _require_or_raise("pandas", "pandas")
        return pd.read_stata(file, **kwargs)

    def read_geo(file, **kwargs):
        gpd = _require_or_raise("geopandas", "geo")
        return gpd.read_file(file, **kwargs)

    def read_tif(file, **kwargs):
        rasterio = _require_or_raise("rasterio", "raster")
        with rasterio.open(str(file), **kwargs) as src:
            data = src.read()
            # Copy the profile so it is detached from the dataset handle.
            profile = src.profile.copy()
        return {"data": data, "profile": profile}

    def read_txt(file, **kwargs):
        # kwargs are forwarded to ``open`` (e.g. ``encoding``).
        with open(file, **kwargs) as f:
            return f.readlines()

    def read_excel(file, **kwargs):
        pd = _require_or_raise("pandas", "pandas")
        return pd.read_excel(file, **kwargs)

    def read_yaml(file, **kwargs):
        import yaml
        # kwargs are forwarded to ``open``, not to the YAML loader.
        with open(file, **kwargs) as f:
            return yaml.safe_load(f)

    def read_nc(file, **kwargs):
        xr = _require_or_raise("xarray", "xarray")
        return xr.open_dataset(file, **kwargs)

    funs = {
        "csv": read_csv,
        "tsv": read_tsv,
        # Compressed files are handed to the CSV reader.
        "gz": read_csv,
        "bz2": read_csv,
        "dbf": read_dbf,
        "dta": read_dta,
        "shp": read_geo,
        "tif": read_tif,
        "tiff": read_tif,
        "geotiff": read_tif,
        "txt": read_txt,
        "xls": read_excel,
        "xlsx": read_excel,
        "yaml": read_yaml,
        "yml": read_yaml,
        "nc": read_nc,
    }

    # Additional geospatial vector formats (via geopandas/GDAL)
    geo_exts = [
        "e00", "fgb", "gdb", "geojson", "geojsonseq", "gml", "gpkg", "gps",
        "gpx", "gtm", "gxt", "jml", "kml", "map", "mdb", "ods", "osm", "pbf",
        "sqlite", "vdv",
    ]
    funs.update(dict.fromkeys(geo_exts, read_geo))

    return funs




[docs]
def autoread(file, **kwargs):
    """Read a file with the reader registered for its extension.

    The path is tilde-expanded, validated, and then dispatched on its final
    suffix (case-insensitive) to one of the functions returned by
    ``get_file_reading_functions``.

    Parameters
    ----------
    file : str or Path
        Full path to the file to read. Tilde expansion is applied.
    **kwargs
        Additional keyword arguments passed to the format-specific reader.

    Returns
    -------
    object
        Whatever the selected reader returns. By format:

        - csv/tsv/xlsx/dta: pandas DataFrame
        - yaml/yml: dict
        - txt: list of str
        - shp/geojson/etc.: geopandas GeoDataFrame
        - tif/geotiff: dict with keys "data" (numpy ndarray) and
          "profile" (dict)
        - nc: xarray Dataset
        - dbf: pandas DataFrame (or list of dicts if pandas not installed)

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    IsADirectoryError
        If the path points to a directory.
    ValueError
        If the file has no extension or the extension is not supported.
    """
    path = Path(os.path.expanduser(str(file)))

    # Validate the path before looking at its extension.
    if not path.exists():
        raise FileNotFoundError(f"Input file '{path}' does not exist.")
    if path.is_dir():
        raise IsADirectoryError(f"Input path '{path}' is a directory, not a file.")

    # Only the last suffix is used, normalized to lowercase without the dot.
    extension = path.suffix.lower().lstrip(".")
    if extension == "":
        raise ValueError(f"File '{path}' has no extension.")

    readers = get_file_reading_functions()
    reader = readers.get(extension)
    if reader is None:
        raise ValueError(
            f"Unsupported file extension '.{extension}'. "
            f"Supported extensions: {sorted(readers)}"
        )
    return reader(path, **kwargs)