python source code of retrieve

# _*_ coding: utf-8 _*_

# Copyright (c) 2020 NMC Developers.
# Distributed under the terms of the GPL V3 License.

"""
A set of Python tools to make it earsier to extract weather station data from
the Global Historical Climatology Network Daily (GHCND).

refer to:
https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt or
ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt
https://github.com/scott-hosking/get-station-data
"""

import numpy as np
import pandas as pd
import urllib.request
from datetime import datetime
from nmc_met_io.config import get_cache_file

missing_id = '-9999'


def get_ghcnd_data(my_stns, stn_md=None, reload_stnmeta=False, update=True):
    """
    Get daily average weather station data (Global) from Global Historical Climate Network Daily,
    refer to https://www.ncdc.noaa.gov/ghcn-daily-description and
    https://nbviewer.jupyter.org/github/scott-hosking/get-station-data/blob/master/Examples/ghcn_daily_data.ipynb
    
    Args:
        my_stns (DataFrame): dataframe should have the 'station' column, which give the
                             station ID.
        stn_md (DataFrame): the station metadata from get_ghcnd_stn_metadata.
        reload_stnmeta (boolean): redownload the station metadata file or not, default to False.
        update (boolean): update the data file or not, if False, the cache file will be used. default to True.
    
    Returns:
        DataFrame: the daily ghcnd obervations.

    Exampels:
    >>> my_stns = pd.DataFrame({"station": ["CHM00054511", "CHM00054527", "CHM00054616"]})
    >>> data = get_ghcnd_data(my_stns)
    """

    if stn_md is None:
        stn_md = get_ghcnd_stn_metadata(download=reload_stnmeta)
    dfs = []

    for stn_id in pd.unique(my_stns['station']):

        stn_md1 = stn_md[ stn_md['station'] == stn_id ]
        lat     = stn_md1['lat'].values[0]
        lon     = stn_md1['lon'].values[0]
        elev    = stn_md1['elev'].values[0]
        name    = stn_md1['name'].values[0]

        # download the lastest observation data to cache file
        url = 'ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/all/'+stn_id+'.dly'
        cache_file = get_cache_file("pub/data/ghcn/daily/all/", stn_id+'.dly', name="GHCN")
        if update:
            if cache_file.is_file():
                cache_file.unlink()
            urllib.request.urlretrieve(url, cache_file)
        else:
            if not cache_file.is_file():
                urllib.request.urlretrieve(url, cache_file)

        # read file data
        df   = _create_DataFrame_1stn(cache_file)

        if len(pd.unique(df['station'])) == 1:
            df['lon']  = lon
            df['lat']  = lat
            df['elev'] = elev
            df['name'] = name
        else:
            raise ValueError('more than one station ID in file')

        dfs.append(df)

    df = pd.concat(dfs)

    df = df.replace(-999.0, np.nan)

    return df


def get_ghcnd_stn_metadata(fname=None, download=False):
    """
    Get the ghcnd station metadata from ghcnd-stations.txt.
    China station start with "CHM000...", like "CHM00054511"
    
    Args:
        fname (string, optional): You can specify the station metadata file. 
                                  Defaults to download the file from website.
    
    Returns:
        [type]: [description]

    Examples:
    >>> stnmd = get_ghcnd_stn_metadata()
    """
    
    if fname == None:
        fname = get_cache_file("pub/data/ghcn/daily/", "ghcnd-stations.txt", name="GHCN")
        if not fname.is_file() or download:
            url = 'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt'
            urllib.request.urlretrieve(url, fname)
    md = pd.read_fwf(fname, colspecs=[(0,12), (12,21), (21,31), (31,38), (38,69)],
                        names=['station','lat','lon','elev','name'])
    return md


def nearest_stn(df, my_x, my_y, n_neighbours=1):
    """
    Find the nestest station to the given longitude/latitude locations.
    
    Args:
        df (DataFrame): the station metadata from get_ghcnd_stn_metadata.
        my_x (numeric): longitude
        my_y (numeric): laititude
        n_neighbours (int, optional): retur n nearest stations. Defaults to 1.
    
    Returns:
        DataFrame: the nearest station metadata.
    """

    from scipy import spatial

    x = df['lon'].values
    y = df['lat'].values

    if x.min() <= my_x <= x.max():
        pass
    else:
        raise ValueError('my_x not within range of longitudes')

    if y.min() <= my_y <= y.max():
        pass
    else:
        raise ValueError('my_y not within range of latitudes')

    tree = spatial.KDTree(list(zip(x, y)))
    d, i = tree.query( [(my_x, my_y)], k=n_neighbours )

    if i.ndim == 1: index = i
    if i.ndim == 2: index = i[0]
    df1 = df.loc[index]

    return df1


def _create_DataFrame_1stn(filename, verbose=False):

    ### read all data
    lines     = np.genfromtxt(filename, delimiter='\n', dtype='str')
    nlines    = len(lines)
    linewidth = lines.dtype.itemsize

    ### initialise arrays & lists
    year    = np.zeros( nlines*31 ).astype(int)
    month   = np.zeros( nlines*31 ).astype(int)
    day     = np.zeros( nlines*31 ).astype(int)
    value   = np.zeros( nlines*31 )
    stn_id  = []
    element = []
    mflag   = []
    qflag   = []
    sflag   = []

    ### Loop through all lines in input file
    i = 0 ### start iteration from zero

    warnings = []

    for line_tmp in lines:

        ### return a string of the correct width, left-justified
        line = line_tmp.ljust(linewidth)

        ### extract values from original line
        ### each new index (i) represents a different day for this 
        ### line (i.e., year and station)
        for d in range(0,31):

            stn_id.append(line[0:11])
            year[i]    = line[11:15]
            month[i]   = line[15:17]
            day[i]     = d+1
            element_tmp = line[17:21]
            element.append(element_tmp)

            ### get column positions for daily data
            cols = np.array([21, 26, 27, 28]) + (8*d)
        
            val_tmp  = line[ cols[0]:cols[1] ]

            if val_tmp == missing_id: 
                value[i] = val_tmp
            elif element_tmp in ['PRCP', 'TMAX', 'TMIN', 'AWND', 'EVAP', 
                                'MDEV', 'MDPR', 'MDTN', 'MDTX', 
                                'MNPN', 'MXPN']:
                ### these are in tenths of a UNIT
                ### (e.g., tenths of degrees C)
                warnings.append(element_tmp+\
                        ' values have been divided by ten' + \
                        ' as specified by readme.txt')
                value[i] = np.float(val_tmp) / 10.
            else:
                value[i] = np.float(val_tmp)
            
            mflag.append(line[ cols[1] ])
            qflag.append(line[ cols[2] ])
            sflag.append(line[ cols[3] ])

            i = i + 1 ### interate by line and by day

    ### Print any warnings
    warnings = np.unique(np.array(warnings))
    if verbose ==True:
        for w in warnings: print(w)

    ### Convert to Pandas DataFrame
    df = pd.DataFrame(columns=['station', 'year', 'month', 'day',
                                'element', 'value',
                                'mflag', 'qflag', 'sflag'])

    df['station'] = stn_id
    df['year']    = year
    df['month']   = month
    df['day']     = day
    df['element'] = element
    df['value']   = value
    df['mflag']   = mflag
    df['qflag']   = qflag
    df['sflag']   = sflag

    df = df.replace(-9999.0, np.nan)

    ### Test validity of dates and add datetime column
    dt = []
    for index, row in df.iterrows():
        try:
            dt.append( datetime(row['year'], row['month'], row['day']) )
        except ValueError:
            # print('Date does not exist:'+\
            #                     str(row['year'])+'-'+\
            #                     str(row['month'])+'-'+\
            #                     str(row['day']) )
            dt.append( np.nan )

    df['date'] = dt
    df = df.dropna( subset=['date'] )

    return df