Source code for pytesmo.io.ismn.interface

# Copyright (c) 2013,Vienna University of Technology, Department of Geodesy and Geoinformation
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the Vienna University of Technology, Department of Geodesy and Geoinformation nor the
#      names of its contributors may be used to endorse or promote products
#      derived from this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL VIENNA UNIVERSITY OF TECHNOLOGY,
# DEPARTMENT OF GEODESY AND GEOINFORMATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

'''
Created on Aug 5, 2013

@author: Christoph Paulik Christoph.Paulik@geo.tuwien.ac.at
'''

import pytesmo.io.ismn.metadata_collector as metadata_collector
import pytesmo.io.ismn.readers as readers
import pygeogrids.grids as grids

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


try:
    from mpl_toolkits.basemap import Basemap
    basemap_installed = True
    from matplotlib.patches import Rectangle
except ImportError:
    basemap_installed = False


[docs]class ISMNError(Exception): pass
[docs]class ISMN_station(object): """ Knows everything about the station, like which variables are measured there in which depths and in which files the data is stored. This is not completely true for the CEOP format since depth_from and depth_to are not easily knowable without parsing the whole file. For CEOP format depth_from and depth_to will only contain the phrase 'multiple' instead of the actual depth Parameters ---------- metadata : numpy.array part of the structured array from metadata_collector.collect_from_folder() which contains only fields for one station Attributes ---------- network : string network the time series belongs to station : string station name the time series belongs to latitude : float latitude of station longitude : float longitude of station elevation : float elevation of station variables : numpy.array variables measured at this station one of * 'soil moisture', * 'soil temperature', * 'soil suction', * 'precipitation', * 'air temperature', * 'field capacity', * 'permanent wilting point', * 'plant available water', * 'potential plant available water', * 'saturation', * 'silt fraction', * 'snow depth', * 'sand fraction', * 'clay fraction', * 'organic carbon', * 'snow water equivalent', * 'surface temperature', * 'surface temperature quality flag original' depth_from : numpy.array shallower depth of layer the variable with same index was measured at depth_to : numpy.array deeper depth of layer the variable with same index was measured at sensors : numpy.array sensor names of variables filenames : numpy.array filenames in which the data is stored Methods ------- get_variables() returns the variables measured at this station get_depths(variable) get the depths in which a variable was measured at this station get_sensors(variable,depth_from,depth_to) get the sensors for the given variable, depth combination read_variable(variable,depth_from=None,depth_to=None,sensor=None) read the data for the given parameter combination """ def __init__(self, metadata): """ Goes through the passed metadata array and fills the attributes accordingly. """ self.network = None self.station = None self.latitude = None self.longitude = None self.elevation = None self.variables = [] self.depth_from = [] self.depth_to = [] self.sensors = [] self.filenames = [] for dataset in metadata: if self.network is None: self.network = dataset['network'] elif self.network != dataset['network']: raise ISMNError( "Networks in Station metadata are not the same") if self.station is None: self.station = dataset['station'] elif self.station != dataset['station']: raise ISMNError( "Station names in Station metadata are not the same") self.latitude = dataset['latitude'] self.longitude = dataset['longitude'] self.elevation = dataset['elevation'] self.variables.append(dataset['variable']) self.depth_from.append(dataset['depth_from']) self.depth_to.append(dataset['depth_to']) self.sensors.append(dataset['sensor']) self.filenames.append(dataset['filename']) self.variables = np.array(self.variables) self.depth_from = np.array(self.depth_from) self.depth_to = np.array(self.depth_to) self.sensors = np.array(self.sensors) self.filenames = np.array(self.filenames)
[docs] def get_variables(self): """ get a list of variables measured at this station Returns ------- variables : numpy.array array of variables measured at this station """ return np.unique(self.variables)
[docs] def get_depths(self, variable): """ get depths at which the given variable was measured at this station Parameters ---------- variable : string variable string best one of those returned by get_variables() or one of * 'soil moisture', * 'soil temperature', * 'soil suction', * 'precipitation', * 'air temperature', * 'field capacity', * 'permanent wilting point', * 'plant available water', * 'potential plant available water', * 'saturation', * 'silt fraction', * 'snow depth', * 'sand fraction', * 'clay fraction', * 'organic carbon', * 'snow water equivalent', * 'surface temperature', * 'surface temperature quality flag original' Returns ------- depth_from : numpy.array depth_to : numpy.array """ if variable in self.variables: index = np.where(variable == self.variables) return self.depth_from[index], self.depth_to[index] else: return None, None
[docs] def get_sensors(self, variable, depth_from, depth_to): """ get the sensors at which the variable was measured at the given depth Parameters ---------- variable : string variable abbreviation one of * 'soil moisture', * 'soil temperature', * 'soil suction', * 'precipitation', * 'air temperature', * 'field capacity', * 'permanent wilting point', * 'plant available water', * 'potential plant available water', * 'saturation', * 'silt fraction', * 'snow depth', * 'sand fraction', * 'clay fraction', * 'organic carbon', * 'snow water equivalent', * 'surface temperature', * 'surface temperature quality flag original' depth_from : float shallower depth of layer the variable was measured at depth_to : float deeper depth of layer the variable was measured at Returns ------- sensors : numpy.array array of sensors found for the given combination of variable and depths Raises ------ ISMNError if no sensor was found for the given combination of variable and depths """ ind_sensors = np.where((variable == self.variables) & (depth_from == self.depth_from) & (depth_to == self.depth_to))[0] if ind_sensors.size == 0: raise ISMNError("variable-depth_from-depth_to combination does not exist." "Please check which depths do exist with get_depths_for_variable") else: return self.sensors[ind_sensors]
[docs] def read_variable(self, variable, depth_from=None, depth_to=None, sensor=None): """ actually reads the given variable from the file. Parameters are required until any ambiguity is resolved. If there is only one depth for the given variable then only variable is required. If there are multiple depths at least depth_from is required. If there are multiple depth_to possibilities for one variable-depth_from combination also depth_to has to be specified. If 2 sensors are measuring the same variable in the same depth then also the sensor has to be specified. Parameters ---------- variable: string variable to read one of * 'soil moisture', * 'soil temperature', * 'soil suction', * 'precipitation', * 'air temperature', * 'field capacity', * 'permanent wilting point', * 'plant available water', * 'potential plant available water', * 'saturation', * 'silt fraction', * 'snow depth', * 'sand fraction', * 'clay fraction', * 'organic carbon', * 'snow water equivalent', * 'surface temperature', * 'surface temperature quality flag original' depth_from : float, optional shallower depth of layer the variable was measured at depth_to : float, optional deeper depth of layer the variable was measured at sensor : string, optional name of the sensor Returns ------- data : readers.ISMNTimeSeries ISMNTimeSeries object containing the relevant metadata for the time series as well as a .data pointing to a pandas.DataFrame Raises ------ ISMNError: if not all ambiguity was resolved by the given input parameters or if no data was found for the given input parameters """ if depth_from is None: depth_f, depth_t = self.get_depths(variable) if depth_f.size > 1: raise ISMNError("there are multiple depths for this variable" "Please specify the one you want to read") elif depth_f.size == 1: depth_from = depth_f[0] elif depth_f.size == 0: raise ISMNError("there are no depths for this variable" "Something went wrong") if depth_to is None: depth_f, depth_t = self.get_depths(variable) if depth_t.size > 1: raise ISMNError("there are multiple depths with the same depth_from value" "Please specify the depth_to value you want to read") elif depth_t.size == 1: depth_to = depth_t[0] elif depth_t.size == 0: raise ISMNError("there are no depths for this variable" "Something went wrong") if sensor is None: sensors = self.get_sensors(variable, depth_from, depth_to) if sensors.size > 1: raise ISMNError("there are multiple sensors for this combination of " "variable, depth_to, depth_from. Please specify which one " "you want to read") elif sensors.size == 1: sensor = sensors[0] elif sensors.size == 0: raise ISMNError("there are no sensors for this variable, depth_from, depth_to " "combination. Please make sure you specified valid depths") index_filename = np.where((variable == self.variables) & (depth_from == self.depth_from) & (depth_to == self.depth_to) & (sensor == self.sensors))[0] if index_filename.size != 1: raise ISMNError("There is no data for this combination of variable, depth_from, " "depth_to and sensor. Please check.") else: return readers.read_data(self.filenames[index_filename[0]])
[docs] def data_for_variable(self, variable, min_depth=None, max_depth=None): """ function to go through all the depth_from, depth_to, sensor combinations for the given variable and yields ISMNTimeSeries if a match is found. if min_depth and/or max_depth where given it only returns a ISMNTimeSeries if depth_from >= min_depth and/or depth_to <= max_depth Parameters ---------- variable: string variable to read one of * 'soil moisture', * 'soil temperature', * 'soil suction', * 'precipitation', * 'air temperature', * 'field capacity', * 'permanent wilting point', * 'plant available water', * 'potential plant available water', * 'saturation', * 'silt fraction', * 'snow depth', * 'sand fraction', * 'clay fraction', * 'organic carbon', * 'snow water equivalent', * 'surface temperature', * 'surface temperature quality flag original' min_depth : float, optional depth_from of variable has to be >= min_depth in order to be included. max_depth : float, optional depth_to of variable has to be <= max_depth in order to be included. Returns ------- time_series : iterator(pytesmo.io.ismn.readers.ISMNTimeSeries) ISMNTimeSeries object containing data and metadata """ if min_depth is None: min_depth = np.min(self.depth_from) if max_depth is None: max_depth = np.max(self.depth_to) for var, d1, d2, filename in zip(self.variables, self.depth_from, self.depth_to, self.filenames): if var != variable: continue if ((d1 >= min_depth) & (d2 <= max_depth)): yield readers.read_data(filename)
[docs] def get_min_max_obs_timestamp(self, variable="soil moisture", min_depth=None, max_depth=None): """ goes throug the filenames associated with a station and reads the date of the first and last observation to get and approximate time coverage of the station. This is just an overview. If holes have to be detected the complete file must be read. Parameters ---------- self: type description variable: string, optional one of * 'soil moisture', * 'soil temperature', * 'soil suction', * 'precipitation', * 'air temperature', * 'field capacity', * 'permanent wilting point', * 'plant available water', * 'potential plant available water', * 'saturation', * 'silt fraction', * 'snow depth', * 'sand fraction', * 'clay fraction', * 'organic carbon', * 'snow water equivalent', * 'surface temperature', * 'surface temperature quality flag original' min_depth : float, optional depth_from of variable has to be >= min_depth in order to be included. max_depth : float, optional depth_to of variable has to be <= max_depth in order to be included. Returns ------- start_date: datetime end_date: datetime """ start_date = None end_date = None if min_depth is None: min_depth = np.min(self.depth_from) if max_depth is None: max_depth = np.max(self.depth_to) for var, d1, d2, filename in zip(self.variables, self.depth_from, self.depth_to, self.filenames): if var == variable and ((d1 >= min_depth) & (d2 <= max_depth)): sdate, edate = readers.get_min_max_timestamp(filename) if start_date is None or start_date > sdate: start_date = sdate if end_date is None or end_date < edate: end_date = edate return start_date, end_date
[docs]class ISMN_Interface(object): """ class provides interface to ISMN data downloaded from the ISMN website upon initialization it collects metadata from all files in path_to_data and saves metadata information in numpy file in folder path_to_data/python_metadata/ First initialization can take a minute or so if all ISMN data is present in path_to_data Parameters ---------- path_to_data : string filepath to unzipped ISMN data containing the Network folders network : string or list, optional provide name of network to only load the given network Raises ------ ISMNError if given network was not found in path_to_data Attributes ---------- metadata : numpy.array metadata array for all stations contained in the path given during initialization grid : pygeogrids.grid.BasicGrid Grid object used for finding nearest insitu station for given lon lat Methods ------- find_nearest_station(lon,lat) find nearest station for given coordinates """ def __init__(self, path_to_data, network=None): """ collects metadata from all files in path_to_data and saves metadata information in numpy file in folder path_to_data/python_metadata/ First initialization can take a minute or so if all ISMN data is present in path_to_data """ if not os.path.exists(os.path.join(path_to_data, 'python_metadata', 'metadata.npy')): self.metadata = metadata_collector.collect_from_folder( path_to_data) os.mkdir(os.path.join(path_to_data, 'python_metadata')) np.save( os.path.join(path_to_data, 'python_metadata', 'metadata.npy'), self.metadata) #np.savetxt(os.path.join(path_to_data,'python_metadata','metadata.npy'), self.metadata,delimiter=',') else: self.metadata = np.load( os.path.join(path_to_data, 'python_metadata', 'metadata.npy')) if network is not None: if type(network) is not list: network = [network] # initialize binary mask the size of metadata mask = np.zeros(self.metadata.shape[0], dtype=np.bool) for net in network: if net in self.metadata['network']: mask = mask | (self.metadata['network'] == net) else: raise ISMNError("Network {} not found".format(net)) self.metadata = self.metadata[mask] # initialize grid object for all stations self.grid = grids.BasicGrid(self.metadata['longitude'], self.metadata['latitude'], setup_kdTree=False)
[docs] def list_networks(self): """ returns numpy.array of networks available through the interface Returns ------- networks : numpy.array unique network names available """ return np.unique(self.metadata['network'])
[docs] def list_stations(self, network=None): """ returns numpy.array of station names available through the interface Parameters ---------- network : string, optional if network name is given only stations belonging to the network are returned Returns ------- networks : numpy.array unique network names available """ if network is None: return np.unique(self.metadata['station']) elif network in self.list_networks(): return np.unique(self.metadata[self.metadata['network'] == network]['station'])
[docs] def get_station(self, stationname, network=None): """ get ISMN_station object by station name Parameters ---------- stationname : string name of station network : string, optional network name, has to be used if stations belonging to different networks have the same name Returns ------- ISMN_station : ISMN_station object Raises ------ ISMNError if stationname was not found """ if network is not None: all_index = np.where((self.metadata['station'] == stationname) & (self.metadata['network'] == network))[0] else: all_index = np.where(self.metadata['station'] == stationname)[0] if all_index.size == 0: raise ISMNError("stationname was not found") metadatasub = self.metadata[all_index] if np.unique(metadatasub['network']).size > 1: raise ISMNError("stationname occurs in multiple networks") return ISMN_station(self.metadata[all_index])
[docs] def stations_that_measure(self, variable): """ Goes through all stations and returns those that measure the specified variable Parameters ---------- variable : string variable name one of * 'soil moisture', * 'soil temperature', * 'soil suction', * 'precipitation', * 'air temperature', * 'field capacity', * 'permanent wilting point', * 'plant available water', * 'potential plant available water', * 'saturation', * 'silt fraction', * 'snow depth', * 'sand fraction', * 'clay fraction', * 'organic carbon', * 'snow water equivalent', * 'surface temperature', * 'surface temperature quality flag original' Returns ------- ISMN_station : ISMN_station object """ for network in self.list_networks(): for stationname in self.list_stations(network=network): station = self.get_station(stationname, network=network) if variable in station.variables: yield station
[docs] def get_dataset_ids(self, variable, min_depth=0, max_depth=0.1): """ returnes list of dataset_id's that can be used to read a dataset directly through the read_ts function """ if max_depth < min_depth: raise ValueError("max_depth can not be less than min_depth") ids = np.where((self.metadata['variable'] == variable) & (self.metadata['depth_to'] <= max_depth) & (self.metadata['depth_from'] >= min_depth))[0] return ids
[docs] def read_ts(self, idx): """ read a time series directly by the id Parameters ---------- idx : int id into self.metadata, best one of those returned from get_dataset_ids() Returns ------- timeseries : pandas.DataFrame of the read data """ ts = readers.read_data(self.metadata['filename'][idx]) return ts.data
[docs] def find_nearest_station(self, lon, lat, return_distance=False): """ finds the nearest station available in downloaded data Parameters ---------- lon : float Longitude of point lat : float Latitude of point return_distance : boolean, optional if True also distance is returned Returns ------- station : ISMN_station ISMN_station object distance : float, optional distance to station in meters, measured in cartesian coordinates and not on a great circle. Should be OK for small distances """ index, d = self.grid.find_nearest_gpi(lon, lat) all_index = np.where( self.metadata['station'] == self.metadata['station'][index]) if return_distance: return ISMN_station(self.metadata[all_index]), d else: return ISMN_station(self.metadata[all_index])
[docs] def plot_station_locations(self, axes=None): """ plots available stations on a world map in robinson projection only available if basemap is installed Parameters ---------- axes: matplotlib.Axes, optional If given then plot will be on this axes. Returns ------- fig: matplotlib.Figure created figure instance. If axes was given this will be None. axes: matplitlib.Axes used axes instance. Raises ------ ISMNError if basemap is not installed """ if basemap_installed: if axes is None: fig = plt.figure() ax = fig.add_axes([0, 0, 0.9, 1]) else: fig = None ax = axes colormap = plt.get_cmap('Set1') ismn_map = Basemap(projection='robin', lon_0=0) uniq_networks = self.list_networks() colorsteps = np.arange(0, 1, 1 / float(uniq_networks.size)) rect = [] for j, network in enumerate(uniq_networks): stations_idx = np.where(self.metadata['network'] == network)[0] unique_stations, us_idx = np.unique( self.metadata['station'][stations_idx], return_index=True) netcolor = colormap(colorsteps[j]) rect.append(Rectangle((0, 0), 1, 1, fc=netcolor)) for i, station in enumerate(unique_stations): lat, lon = self.metadata['latitude'][stations_idx[us_idx[i]]], self.metadata[ 'longitude'][stations_idx[us_idx[i]]] x, y = ismn_map(lon, lat) im = ismn_map.scatter( x, y, c=netcolor, s=10, marker='s', edgecolors='none', ax=ax) ismn_map.drawcoastlines(linewidth=0.25) ismn_map.drawcountries(linewidth=0.25) ismn_map.drawstates(linewidth=0.25) plt.legend( rect, uniq_networks.tolist(), loc='lower center', ncol=uniq_networks.size / 4) return fig, ax else: raise ISMNError('Basemap is not installed.')
[docs] def get_min_max_obs_timestamps(self, variable="soil moisture", min_depth=None, max_depth=None): """ get minimum and maximum timestamps per station Parameters ---------- self: type description variable: string, optional one of * 'soil moisture', * 'soil temperature', * 'soil suction', * 'precipitation', * 'air temperature', * 'field capacity', * 'permanent wilting point', * 'plant available water', * 'potential plant available water', * 'saturation', * 'silt fraction', * 'snow depth', * 'sand fraction', * 'clay fraction', * 'organic carbon', * 'snow water equivalent', * 'surface temperature', * 'surface temperature quality flag original' min_depth : float, optional depth_from of variable has to be >= min_depth in order to be included. max_depth : float, optional depth_to of variable has to be <= max_depth in order to be included. Returns ------- data : pd.DataFrame dataframe with multiindex Network Station and columns start_date and end_date """ networks = [] stations = [] start_dates = [] end_dates = [] for network in self.list_networks(): for stationname in self.list_stations(network=network): # append station and network names to lists for # construction of pandas mulitindex networks.append(network) stations.append(stationname) station = self.get_station(stationname, network=network) startd, endd = station.get_min_max_obs_timestamp(variable=variable, min_depth=min_depth, max_depth=max_depth) start_dates.append(startd) end_dates.append(endd) data = pd.DataFrame({"start date": start_dates, "end date": end_dates}, index=[np.array(networks), np.array(stations)]) return data