# Source code for erddapy.erddapy

"""Pythonic way to access ERDDAP data."""

from __future__ import annotations

import functools
import hashlib
from pathlib import Path
from typing import TYPE_CHECKING
from urllib.request import urlretrieve

import pandas as pd

from erddapy.core.griddap import (
    _griddap_check_constraints,
    _griddap_check_variables,
    _griddap_get_constraints,
)
from erddapy.core.interfaces import to_iris, to_ncCF, to_pandas, to_xarray
from erddapy.core.url import (
    _check_substrings,
    _distinct,
    _format_constraints_url,
    _quote_string_constraints,
    _sort_url,
    download_formats,
    get_categorize_url,
    get_download_url,
    get_info_url,
    get_search_url,
    parse_dates,
    urlopen,
)
from erddapy.servers.servers import servers

# Objects used by downstream packages.
# Besides the ERDDAP class itself, this re-exports helpers that are imported
# above from `erddapy.core.url` (including underscore-prefixed ones), so they
# stay importable from this module.
__all__ = [
    "ERDDAP",
    "_check_substrings",
    "_distinct",
    "_format_constraints_url",
    "_quote_string_constraints",
    "parse_dates",
    "urlopen",
]

if TYPE_CHECKING:
    import iris.cube
    import netCDF4.Dataset
    import xarray as xr

OptionalBool = bool | None
OptionalDict = dict | None
OptionalList = list[str] | tuple[str] | None
OptionalStr = str | None


class ERDDAP:
    """Creates an ERDDAP instance for a specific server endpoint.

    Args:
    ----
        server: an ERDDAP server URL or an acronym, if using the builtin
            servers.
        protocol: tabledap or griddap.

    Attributes:
    ----------
        dataset_id: a dataset unique id.
        variables: a list variables to download.
        response: default is HTML.
        constraints: download constraints, default None (opendap-like url)
        params and requests_kwargs: `httpx.get` options

    Returns:
    -------
        instance: the ERDDAP URL builder.

    Examples:
    --------
        Specifying the server URL

        >>> e = ERDDAP(server="https://gliders.ioos.us/erddap")

        let's search for glider `ru29` and read the csv response with pandas.

        >>> import pandas as pd
        >>> url = e.get_search_url(search_for="ru29", response="csv")
        >>> pd.read_csv(url)["Dataset ID"]
        0    ru29-20150623T1046
        1    ru29-20161105T0131
        Name: Dataset ID, dtype: object

        there are "shortcuts" for some servers

        >>> e = ERDDAP(server="SECOORA")
        >>> e.server
        'http://erddap.secoora.org/erddap'

        to get a list of the shortcuts available servers:

        >>> from erddapy import servers
        >>> {k: v.url for k, v in servers.items()}
        {'MDA': 'https://bluehub.jrc.ec.europa.eu/erddap/',
         'MII': 'https://erddap.marine.ie/erddap/',
         'CSCGOM': 'http://cwcgom.aoml.noaa.gov/erddap/',
         'CSWC': 'https://coastwatch.pfeg.noaa.gov/erddap/',
         'CeNCOOS': 'http://erddap.axiomalaska.com/erddap/',
         'NERACOOS': 'http://www.neracoos.org/erddap/',
         'NGDAC': 'https://gliders.ioos.us/erddap/',
         'PacIOOS': 'http://oos.soest.hawaii.edu/erddap/',
         'SECOORA': 'http://erddap.secoora.org/erddap/',
         'NCEI': 'https://ecowatch.ncddc.noaa.gov/erddap/',
         'OSMC': 'http://osmc.noaa.gov/erddap/',
         'UAF': 'https://upwell.pfeg.noaa.gov/erddap/',
         'ONC': 'http://dap.onc.uvic.ca/erddap/',
         'BMLSC': 'http://bmlsc.ucdavis.edu:8080/erddap/',
         'RTECH': 'https://meteo.rtech.fr/erddap/',
         'IFREMER': 'http://www.ifremer.fr/erddap/',
         'UBC': 'https://salishsea.eos.ubc.ca/erddap/'}

    """

    def __init__(
        self: ERDDAP,
        server: str,
        protocol: OptionalStr = None,
        response: str = "html",
    ) -> None:
        """Instantiate main class attributes.

        Attributes
        ----------
            server: the server URL, or a key of the builtin `servers`
                mapping (looked up case-insensitively).
            protocol: ERDDAP's protocol (tabledap/griddap)
            response: default is HTML.

        """
        if server.lower() in servers:
            server = servers[server.lower()].url
        # Stored without trailing slashes.
        self.server = server.rstrip("/")
        self.protocol = protocol
        self.response = response

        # Request options; empty until the caller (or `griddap_initialize`)
        # assigns them.
        self.server_functions: dict | None = None
        self.requests_kwargs: dict = {}
        self.auth: tuple | None = None
        self.constraints: dict | None = None
        self.variables: OptionalList = None
        self.dim_names: OptionalList = None

        # Snapshots taken by `griddap_initialize`; `get_download_url`
        # validates griddap requests against them. None means no griddap
        # metadata has been loaded yet.
        self._constraints_original: dict | None = None
        self._variables_original: OptionalList = None

        # Per-instance cache around the info request.
        self._get_variables = functools.lru_cache(maxsize=128)(
            self._get_variables_uncached,
        )
        # Caching the last dataset_id and variables list request for
        # quicker access, will be overridden when requesting a new dataset_id.
        self._dataset_id: OptionalStr = None
        self._variables: dict = {}

    @property
    def dataset_id(self) -> OptionalStr:
        """dataset_id property (None until one is assigned)."""
        return self._dataset_id

    @dataset_id.setter
    def dataset_id(self, value: str) -> None:
        # Assigning an id also (re)loads the griddap metadata; that call is a
        # no-op for non-griddap protocols and for the opendap response.
        self._dataset_id = value
        self.griddap_initialize(dataset_id=value)

    def griddap_initialize(
        self: ERDDAP,
        dataset_id: OptionalStr = None,
        step: int = 1,
    ) -> None:
        """Fetch metadata of dataset and initialize constraints and variables.

        Args:
        ----
            dataset_id: a dataset unique id. Falls back to the instance's
                `dataset_id` when empty.
            step: step used to subset dataset

        Raises:
        ------
            ValueError: if the protocol is griddap and no dataset_id is
                available.

        """
        dataset_id = dataset_id if dataset_id else self.dataset_id

        # Short-circuit for opendap and/or non-griddap datasets.
        if self.protocol != "griddap" or self.response == "opendap":
            return

        if dataset_id is None:
            msg = f"Must set a valid dataset_id, got {dataset_id}"
            raise ValueError(msg)

        # Use the resolved id, so an explicitly passed `dataset_id` is honored
        # instead of being silently replaced by the instance's current one.
        metadata_url = f"{self.server}/griddap/{dataset_id}"
        (
            self.constraints,
            self.dim_names,
            self.variables,
        ) = _griddap_get_constraints(metadata_url, step)
        # Keep the stored id in sync with the metadata that was just loaded.
        self._dataset_id = dataset_id
        self._constraints_original = self.constraints.copy()
        self._variables_original = self.variables.copy()

    def get_search_url(
        self: ERDDAP,
        response: OptionalStr = None,
        search_for: OptionalStr = None,
        protocol: OptionalStr = None,
        items_per_page: int = 1_000_000,
        page: int = 1,
        **kwargs: dict,
    ) -> str:
        """Build the search URL for the `server` endpoint provided.

        Args:
        ----
            search_for: "Google-like" search of the datasets' metadata.

                - Type the words you want to search for,
                  with spaces between the words.
                  ERDDAP will search for the words separately,
                  not as a phrase.
                - To search for a phrase, put double quotes around the
                  phrase (for example, `"wind speed"`).
                - To exclude datasets with a specific word use
                  `-excludedWord`.
                - To exclude datasets with a specific phrase, use
                  `-"excluded phrase"`
                - Searches are not case-sensitive.
                - You can search for any part of a word. For example,
                  searching for `spee` will find datasets with `speed`
                  and datasets with `WindSpeed`
                - The last word in a phrase may be a partial word.
                  For example, to find datasets from a specific website
                  (usually the start of the datasetID), include
                  (for example) `"datasetID=erd"` in your search.

            response: default is HTML.
            protocol: tabledap or griddap.
            items_per_page: how many items per page in the return,
                default is 1_000_000 (hopefully all items).
            page: which page to display, default is the first page (1).
            kwargs: extra search constraints based on metadata and/or
                coordinates key/value.
                metadata: `cdm_data_type`, `institution`, `ioos_category`,
                `keywords`, `long_name`, `standard_name`, and `variableName`.
                coordinates: `minLon`, `maxLon`, `minLat`, `maxLat`,
                `minTime`, and `maxTime`.

        Returns:
        -------
            url: the search URL.

        """
        # Empty arguments fall back to the instance-level settings.
        protocol = protocol if protocol else self.protocol
        response = response if response else self.response

        return get_search_url(
            self.server,
            response=response,
            search_for=search_for,
            protocol=protocol,
            items_per_page=items_per_page,
            page=page,
            **kwargs,
        )

    def get_info_url(
        self: ERDDAP,
        dataset_id: OptionalStr = None,
        response: OptionalStr = None,
    ) -> str:
        """Build the info URL for the `server` endpoint.

        Args:
        ----
            dataset_id: a dataset unique id.
                If empty the full dataset listing will be returned.
            response: default is HTML.

        Returns:
        -------
            url: the info URL for the `response` chosen.

        """
        # Empty arguments fall back to the instance-level settings.
        dataset_id = dataset_id if dataset_id else self.dataset_id
        response = response if response else self.response

        return get_info_url(
            self.server,
            dataset_id=dataset_id,
            response=response,
        )

    def get_categorize_url(
        self: ERDDAP,
        categorize_by: str,
        value: OptionalStr = None,
        response: OptionalStr = None,
    ) -> str:
        """Build the categorize URL for the `server` endpoint.

        Args:
        ----
            categorize_by: a valid attribute, e.g. ioos_category
                or standard_name. Valid attributes are shown in
                http://erddap.ioos.us/erddap/categorize page.
            value: an attribute value.
            response: default is HTML.

        Returns:
        -------
            url: the categorized URL for the `response` chosen.

        """
        response = response if response else self.response
        return get_categorize_url(self.server, categorize_by, value, response)

    def get_download_url(  # noqa: PLR0913
        self: ERDDAP,
        *,
        dataset_id: OptionalStr = None,
        protocol: OptionalStr = None,
        variables: OptionalList = None,
        dim_names: OptionalList = None,
        response: OptionalStr = None,
        constraints: OptionalDict = None,
        distinct: OptionalBool = False,
    ) -> str:
        """Build the download URL for the `server` endpoint.

        Every empty argument falls back to the instance attribute of the
        same name.

        Args:
        ----
            dataset_id: a dataset unique id.
            protocol: tabledap or griddap.
            variables (list/tuple): a list of the variables to download.
            dim_names (list/tuple): a list of the dimensions (griddap only).
            response (str): default is HTML.
            constraints (dict): download constraints, default None (opendap).
            distinct (bool): if true, only unique values will be downloaded.

        Example:
        -------
            constraints = {
                'latitude<=': 41.0,
                'latitude>=': 38.0,
                'longitude<=': -69.0,
                'longitude>=': -72.0,
                'time<=': '2017-02-10T00:00:00+00:00',
                'time>=': '2016-07-10T00:00:00+00:00',
            }

            One can also use relative constraints like:

            constraints = {
                'time>': 'now-7days',
                'latitude<': 'min(longitude)+180',
                'depth>': 'max(depth)-23',
            }

        Returns:
        -------
            url (str): the download URL for the `response` chosen.

        Raises:
        ------
            ValueError: if no `dataset_id` or no `protocol` is available.

        """
        dataset_id = dataset_id if dataset_id else self.dataset_id
        protocol = protocol if protocol else self.protocol
        variables = variables if variables else self.variables
        dim_names = dim_names if dim_names else self.dim_names
        response = response if response else self.response
        constraints = constraints if constraints else self.constraints

        if not dataset_id:
            msg = f"Please specify a valid `dataset_id`, got {dataset_id}"
            raise ValueError(msg)

        if not protocol:
            msg = f"Please specify a valid `protocol`, got {protocol}"
            raise ValueError(msg)

        if (
            protocol == "griddap"
            and constraints is not None
            and variables is not None
            and dim_names is not None
            # Nothing to validate against when the griddap metadata was never
            # loaded (e.g. the protocol was only given for this call).
            and self._constraints_original is not None
            and self._variables_original is not None
        ):
            # Check that dimensions, constraints,
            # and variables are valid for this dataset.
            _griddap_check_constraints(constraints, self._constraints_original)
            _griddap_check_variables(variables, self._variables_original)

        return get_download_url(
            self.server,
            dataset_id=dataset_id,
            protocol=protocol,
            variables=variables,
            dim_names=dim_names,
            response=response,
            constraints=constraints,
            distinct=distinct,
        )

    def to_pandas(
        self: ERDDAP,
        requests_kwargs: dict | None = None,
        **kw: dict,
    ) -> pd.DataFrame:
        """Save a data request to a pandas.DataFrame.

        Accepts any `pandas.read_csv` keyword arguments; they are collected
        and forwarded as `pandas_kwargs`.

        This method uses the .csvp [1] response as the default for simplicity,
        please check ERDDAP's docs for the other csv options available.

        [1] Download a ISO-8859-1 .csv file with line 1: name (units).
            Times are ISO 8601 strings.

        requests_kwargs: kwargs to be passed to urlopen method.
        **kw: kwargs to be passed to third-party library (pandas).
            The `response` and `distinct` keys are consumed here to build
            the download URL and are not forwarded.

        """
        response = kw.pop("response", "csvp")
        distinct = kw.pop("distinct", False)
        url = self.get_download_url(response=response, distinct=distinct)
        return to_pandas(
            url,
            requests_kwargs=requests_kwargs,
            pandas_kwargs=dict(**kw),
        )

    def to_ncCF(  # noqa: N802
        self: ERDDAP,
        protocol: OptionalStr = None,
        **kw: dict,
    ) -> netCDF4.Dataset:
        """Load the data request into a CF compliant netCDF4-python object.

        The `distinct` key of `kw` is used for the download URL; the
        remaining keys are forwarded as `requests_kwargs`.
        """
        distinct = kw.pop("distinct", False)
        protocol = protocol if protocol else self.protocol
        url = self.get_download_url(response="ncCF", distinct=distinct)
        return to_ncCF(url, protocol=protocol, requests_kwargs=dict(**kw))

    def to_xarray(
        self: ERDDAP,
        requests_kwargs: dict | None = None,
        **kw: dict,
    ) -> xr.Dataset:
        """Load the data request into a xarray.Dataset.

        Accepts any `xr.open_dataset` keyword arguments.

        requests_kwargs: extra request options; the instance `auth` is
            included unless overridden by an `auth` key here.
        """
        if self.response == "opendap":
            response = "opendap"
        elif self.protocol == "griddap":
            response = "nc"
        else:
            response = "ncCF"

        distinct = kw.pop("distinct", False)
        url = self.get_download_url(response=response, distinct=distinct)
        # Caller-supplied options win over the instance-level auth.
        requests_kwargs = {"auth": self.auth, **(requests_kwargs or {})}
        return to_xarray(
            url,
            response,
            requests_kwargs,
            xarray_kwargs=dict(**kw),
        )

    def to_iris(self: ERDDAP, **kw: dict) -> iris.cube.CubeList:
        """Load the data request into an iris.cube.CubeList.

        Accepts any `iris.load_raw` keyword arguments.
        """
        response = "nc" if self.protocol == "griddap" else "ncCF"
        distinct = kw.pop("distinct", False)
        url = self.get_download_url(response=response, distinct=distinct)
        return to_iris(url, iris_kwargs=dict(**kw))

    def _get_variables_uncached(
        self: ERDDAP,
        dataset_id: OptionalStr = None,
    ) -> dict:
        """Return `{variable name: {attribute name: value}}` for a dataset.

        The mapping is parsed from the dataset's info `csv` response. The
        requested `dataset_id` is remembered on the instance.

        Raises
        ------
            ValueError: if no dataset_id is given or set on the instance.

        """
        if not dataset_id:
            dataset_id = self.dataset_id

        if dataset_id is None:
            msg = f"You must specify a valid dataset_id, got {dataset_id}"
            raise ValueError(msg)

        url = self.get_info_url(dataset_id=dataset_id, response="csv")

        variables = {}
        data = urlopen(url, self.requests_kwargs)
        _df = pd.read_csv(data)
        self._dataset_id = dataset_id
        # `dict.fromkeys` de-duplicates like a set would, but keeps the order
        # of first appearance so results are the same on every run.
        for variable in dict.fromkeys(_df["Variable Name"]):
            attributes = (
                _df.loc[
                    _df["Variable Name"] == variable,
                    ["Attribute Name", "Value"],
                ]
                .set_index("Attribute Name")
                .to_dict()["Value"]
            )
            variables.update({variable: attributes})
        return variables

    def get_var_by_attr(
        self: ERDDAP,
        dataset_id: OptionalStr = None,
        **kwargs: dict,
    ) -> list[str]:
        """Return the variables whose attributes match all the given filters.

        The `get_var_by_attr` method will create an info `csv` return,
        for the `dataset_id`, and the variables attribute dictionary,
        similar to netCDF4-python `get_variables_by_attributes`.

        Each keyword is an attribute name. Its value is either the expected
        attribute value, or a callable that receives the attribute value
        (None when the attribute is missing) and whose result is interpreted
        by truthiness. Without any keyword, no variable is returned.

        Examples
        --------
            >>> e = ERDDAP(server="https://gliders.ioos.us/erddap")
            >>> dataset_id = "whoi_406-20160902T1700"

            Get variables with x-axis attribute.

            >>> e.get_var_by_attr(dataset_id, axis="X")
            ['longitude']

            Get variables with matching "standard_name" attribute

            >>> e.get_var_by_attr(
            ...     dataset_id, standard_name="northward_sea_water_velocity"
            ... )
            ['v']

            Get Axis variables

            >>> axis = lambda v: v in ["X", "Y", "Z", "T"]
            >>> e.get_var_by_attr(dataset_id, axis=axis)
            ['latitude', 'longitude', 'time', 'depth']

        """
        variables = self._get_variables(dataset_id=dataset_id)
        # Virtually the same code as the netCDF4 counterpart.
        vs = []
        for vname, attributes in variables.items():
            matched = False
            for key, wanted in kwargs.items():
                current = attributes.get(key)
                if callable(wanted):
                    # Coerce so predicates returning truthy non-bool values
                    # (e.g. an `re.Match`) count as a match.
                    matched = bool(wanted(current))
                else:
                    matched = bool(current and current == wanted)
                if not matched:
                    break
            if matched:
                vs.append(vname)
        return vs

    def download_file(
        self: ERDDAP,
        file_type: str,
    ) -> Path:
        """Download the dataset to a file in a user specified format.

        The file is written to the current working directory as
        `<dataset_id>_<hash>.<file_type>`, where the hash is derived from the
        download URL. An existing file with that name is reused instead of
        being downloaded again.

        Args:
        ----
            file_type: one of the ERDDAP download formats, with or without
                leading dots.

        Returns:
        -------
            file_name: the path of the downloaded (or reused) file.

        Raises:
        ------
            ValueError: if `file_type` is not a known download format.

        """
        file_type = file_type.lstrip(".")
        if file_type not in download_formats:
            msg = f"Requested filetype {file_type} not available on ERDDAP"
            raise ValueError(msg)
        url = _sort_url(self.get_download_url(response=file_type))
        fname_hash = hashlib.shake_256(url.encode()).hexdigest(5)
        file_name = Path(f"{self.dataset_id}_{fname_hash}.{file_type}")
        if not file_name.exists():
            # Download to a side file and rename only on success, so a failed
            # transfer never leaves a truncated file that later calls would
            # mistake for a complete cached copy.
            partial = file_name.with_name(f"{file_name.name}.part")
            try:
                urlretrieve(url, partial)  # noqa: S310
                partial.replace(file_name)
            finally:
                partial.unlink(missing_ok=True)
        return file_name