Source code for pyLARDA.Connector

#!/usr/bin/python3

import os, sys, time
import glob
import copy
import re
import datetime
import calendar
import pprint
import functools
import pprint as pprint2
from pathlib import Path

from typing import Callable

import pyLARDA.NcReader as NcReader
import pyLARDA.RPGReader as RPGReader
import pyLARDA.ParameterInfo as ParameterInfo
#import pyLARDA.DataBuffer as DataBuffer
#import pyLARDA.MeteoReader as MeteoReader
#import pyLARDA.Spec as Spec
import pyLARDA.peakTree as peakTree
import pyLARDA.trace_reader as trace_reader
import pyLARDA.helpers as h
import pyLARDA.Transformations as Transf

import numpy as np
from operator import itemgetter
import collections
import json
import requests, msgpack
from tqdm import tqdm
#import cbor2

import logging
logger = logging.getLogger(__name__)

DATEstrfmt = "%Y%m%d-%H%M%S"

def convert_regex_date_to_dt(re_date):
    """convert a re_date dict to datetime

    .. warning::

        When using 2 digit years (e.g. RPG) a 20 will be added in front

    Args:
        re_date (dict): result of the regex search with named groups

    Returns:
        datetime
    """
    l = []
    if len(re_date['year']) == 2:
        re_date['year'] = '20' + re_date['year']
    for k in ['year', 'month', 'day', 'hour', 'minute', 'second']:
        if k in re_date.keys() and re_date[k] is not None:
            l.append(int(re_date[k]))
    return datetime.datetime(*l)

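# Illustrative usage sketch (not part of the module): the regex pattern and the
# filename below are hypothetical, only the named groups matter.
# >>> m = re.search(r'(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})', 'scan_20210314.nc')
# >>> convert_regex_date_to_dt(m.groupdict())
# datetime.datetime(2021, 3, 14, 0, 0)
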
def convert_to_datestring(datepattern, f):
    """convert the date in a (file-)string to a datestring

    Args:
        datepattern: a python regex definition with named groups
        f: the string

    Returns:
        datestring in DATEstrfmt, or -1 if the pattern does not match
    """
    try:
        dt = convert_regex_date_to_dt(
            re.search(datepattern, f).groupdict())
    except AttributeError:
        logger.warning(f'No matching date pattern "{datepattern}" in file: "{f}"')
        return -1
    return dt.strftime(DATEstrfmt)

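# Illustrative usage sketch with the same hypothetical pattern as above; the
# result is a string formatted with DATEstrfmt.
# >>> convert_to_datestring(r'(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})', 'scan_20210314.nc')
# '20210314-000000'
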
def setupreader(paraminfo) -> Callable:
    """obtain the reader from the paraminfo
    """
    if paraminfo["ncreader"] == 'timeheight_limrad94':
        reader = NcReader.timeheightreader_rpgfmcw(paraminfo)
    elif paraminfo["ncreader"] in ['spec_rpg94binary', 'timeheight_rpg94binary', 'time_rpg94binary']:
        reader = RPGReader.rpgfmcw_binary(paraminfo)
    elif paraminfo["ncreader"] in ['time_hatprobinary', 'timeheight_hatprobinary']:
        reader = RPGReader.hatpro_binary(paraminfo)
    elif paraminfo["ncreader"] == 'spec_limrad94':
        reader = NcReader.specreader_rpgfmcw(paraminfo)
    elif paraminfo["ncreader"] == 'spec_rpgpy':
        reader = NcReader.specreader_rpgpy(paraminfo)
    elif paraminfo["ncreader"] == 'spec_kazr':
        reader = NcReader.specreader_kazr(paraminfo)
    elif paraminfo["ncreader"] in ['aux', 'aux_all_ts', 'aux_ts_slice']:
        reader = NcReader.auxreader(paraminfo)
    elif paraminfo["ncreader"] in ['scan_timeheight', 'scan_time']:
        reader = NcReader.scanreader_mira(paraminfo)
    elif paraminfo['ncreader'] == 'peakTree':
        reader = peakTree.peakTree_reader(paraminfo)
    elif paraminfo['ncreader'] == 'trace':
        reader = trace_reader.trace_reader(paraminfo)
    elif paraminfo['ncreader'] == 'trace2':
        reader = trace_reader.trace_reader2(paraminfo)
    elif paraminfo["ncreader"] == 'pollyraw':
        reader = NcReader.reader_pollyraw(paraminfo)
    elif paraminfo["ncreader"] == 'mrrpro_spec':
        paraminfo.update({"ncreader": "spec", "compute_velbins": "mrrpro"})
        reader = NcReader.reader(paraminfo)
    elif paraminfo["ncreader"] == "wyoming_sounding_txt":
        reader = NcReader.reader_wyoming_sounding(paraminfo)
    elif paraminfo["ncreader"] == 'psd':
        reader = NcReader.psd_reader(paraminfo)
    else:
        reader = NcReader.reader(paraminfo)

    return reader

def setup_valid_date_filter(valid_dates) -> Callable:
    """validator function for chunks of valid dates

    Args:
        valid_dates: list of [begin, end] in 'YYYYMMDD'

    Returns:
        a single argument ('YYYYMMDD-HHMMSS') validator function
    """
    def date_filter(e):
        datepair, f = e
        f_b, f_e = datepair
        #print(valid_dates, datepair, f_b, f_e)
        #print([(f_b >= valid[0] and f_e <= valid[1]) for valid in valid_dates])
        return any([(f_b[:-7] >= valid[0] and f_e[:-7] <= valid[1]) for valid in valid_dates])
    return date_filter

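# Illustrative usage sketch (hypothetical dates and filenames): the returned
# filter operates on ((begin, end), filename) tuples as assembled in
# Connector.build_filehandler below.
# >>> date_filter = setup_valid_date_filter([['20210301', '20210331']])
# >>> date_filter((('20210314-000000', '20210314-235959'), './scan_20210314.nc'))
# True
# >>> date_filter((('20210401-000000', '20210401-235959'), './scan_20210401.nc'))
# False
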
def path_walk(top, prefilter='.*', topdown=False, followlinks=False):
    """counterpart of ``os.walk`` for ``pathlib.Path`` objects
    """
    current_level = len(str(top).split('/'))
    filter_at_level = '/'.join(prefilter.split('/')[:current_level])
    regex = re.compile(filter_at_level)
    #print(current_level, filter_at_level)
    #print(str(top).split('/'))
    #print(str(prefilter).split('/'))

    #stime = time.time()
    #names = list(top.iterdir())
    #print("    {:5.3f}s".format(time.time() - stime))
    #stime = time.time()
    names = [f for f in top.iterdir() if regex.search(str(f))]
    #print("    {:5.3f}s".format(time.time() - stime))
    #print(len(names), names[:30])

    dirs = [node for node in names if node.is_dir() is True]
    nondirs = [node for node in names if node.is_dir() is False]

    if topdown:
        yield top, dirs, nondirs

    for name in dirs:
        if followlinks or name.is_symlink() is False:
            for x in path_walk(name, prefilter, topdown, followlinks):
                yield x

    if topdown is not True:
        yield top, dirs, nondirs

def end_1sec_earlier(date):
    dt = datetime.datetime.strptime(date, DATEstrfmt)
    return (dt - datetime.timedelta(seconds=1)).strftime(DATEstrfmt)

def guess_end(dates):
    """estimate the end of a file

    Returns:
        list of pairs [begin, end]
    """
    if len(dates) > 1:
        guessed_duration = (datetime.datetime.strptime(dates[-1], DATEstrfmt)
                            - datetime.datetime.strptime(dates[-2], DATEstrfmt))
    else:
        guessed_duration = datetime.timedelta(seconds=(24*60*60)-1)
    # quick fix: guessed duration not longer than 24 h
    if guessed_duration >= datetime.timedelta(days=1):
        guessed_duration = datetime.timedelta(seconds=(24*60*60)-1)

    last_d = (
        datetime.datetime.strptime(dates[-1], DATEstrfmt) + guessed_duration
    ).strftime(DATEstrfmt)
    ends = [end_1sec_earlier(d) for d in dates[1:]] + [last_d]
    return list(zip(dates, ends))

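# Illustrative usage sketch (hypothetical daily files): the end of each file is
# guessed from the spacing of consecutive begin times, capped at 24 h.
# >>> guess_end(['20210314-000000', '20210315-000000'])
# [('20210314-000000', '20210314-235959'), ('20210315-000000', '20210315-235959')]
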
class Connector_remote:
    """connect the data (from a remote source) to larda

    Args:
        camp_name (str): campaign name
        system (str): system identifier
        plain_dict (dict): connector meta info
        uri (str): address of the remote source
    """
    def __init__(self, camp_name, system, plain_dict, uri):
        self.camp_name = camp_name
        self.system = system
        self.params_list = list(plain_dict['params'].keys())
        print(self.system, self.params_list)
        self.plain_dict = plain_dict
        self.uri = uri

    def collect(self, param, time_interval, *further_intervals, **kwargs) -> dict:
        """collect the data from a parameter for the given intervals

        Args:
            param (str): identifying the parameter
            time_interval: list of begin and end datetime
            *further_intervals: range, velocity, ...
            **interp_rg_join: interpolate range during join

        Returns:
            data_container
        """
        resp_format = 'msgpack'
        interval = ["-".join([str(h.dt_to_ts(dt)) for dt in time_interval])]
        interval += ["-".join([str(i) for i in pair]) for pair in further_intervals]
        stream = True if resp_format == "msgpack" else False
        params = {"interval": ','.join(interval), 'rformat': resp_format}
        params.update(kwargs)
        resp = requests.get(
            self.uri + '/api/{}/{}/{}'.format(self.camp_name, self.system, param),
            params=params, stream=stream)
        logger.debug("fetching data from: {}".format(resp.url))

        if resp_format == "msgpack":
            block_size = 1024
            pbar = tqdm(unit="B",
                        total=(int(resp.headers.get('content-length', 0))//block_size)*block_size,
                        unit_divisor=1024, unit_scale=True)
            content = bytearray()
            for data in resp.iter_content(block_size):
                content.extend(data)
                pbar.update(len(data))

        if resp.status_code != 200:
            if resp_format == "msgpack":
                print("Error at Backend")
                print(content.decode("unicode_escape"))
            else:
                print(resp.json())
            raise ConnectionError("bad status code of response {}".format(resp.status_code))

        starttime = time.time()
        # if resp_format == 'bin':
        #     data_container = cbor2.loads(resp.content)
        if resp_format == 'msgpack':
            logger.info("msgpack version {}".format(msgpack.version))
            if msgpack.version[0] < 1:
                data_container = msgpack.loads(content, encoding='utf-8')
            else:
                data_container = msgpack.loads(content, strict_map_key=False)
        elif resp_format == 'json':
            data_container = resp.json()
        #print("{:5.3f}s decode data".format(time.time() - starttime))

        starttime = time.time()
        for k in ['ts', 'rg', 'vel', 'var', 'mask', 'vel_ch2', 'vel_ch3', 'aux']:
            if k in data_container and type(data_container[k]) == list:
                data_container[k] = np.array(data_container[k])
        logger.info("loaded data container from remote: {}".format(data_container.keys()))
        #print("{:5.3f}s converted to np arrays".format(time.time() - starttime))
        return data_container

    def description(self, param):
        """get the description str"""
        resp = requests.get(
            self.uri + '/description/{}/{}/{}'.format(self.camp_name, self.system, param))
        if resp.status_code != 200:
            raise ConnectionError("bad status code of response {}".format(resp.status_code))
        logger.warning(resp.text)
        return resp.text

    def get_as_plain_dict(self) -> dict:
        """put the most important information of the connector
        into a plain dict (for http transfer)"""
        return self.plain_dict

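# Illustrative usage sketch for Connector_remote (campaign, system, parameter
# names and the uri are hypothetical, plain_dict stands for the meta info
# obtained from the backend); normally larda instantiates this class itself.
# >>> conn = Connector_remote('lacros_dacapo', 'CLOUDNET', plain_dict, 'http://larda.example.org')
# >>> data = conn.collect('Z', [datetime.datetime(2021, 3, 14, 0), datetime.datetime(2021, 3, 14, 6)])
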
def walk_str(pathinfo):
    """match the names and subdirs with regex using plain strings

    only works on unix systems, but is reasonably fast

    Args:
        pathinfo: dict

    Returns:
        all_files
    """
    assert os.name == 'posix', 'walk_str only works with string based filepath'
    all_files = []
    current_regex = pathinfo['matching_subdirs'] if 'matching_subdirs' in pathinfo else ''
    current_re = re.compile(current_regex)
    prefilter = pathinfo['base_dir'] + pathinfo['prefilter_subdirs'] if 'prefilter_subdirs' in pathinfo else '.*'

    for root, d, files in os.walk(pathinfo['base_dir'], topdown=True):
        #print('walk ', root, len(list(files)), files[:10])
        root = root[:-1] if (root[-1] == '/') else root
        current_level = len(root.split('/'))
        filter_at_level = '/'.join(prefilter.split('/')[:current_level+1])
        regex = re.compile(filter_at_level)
        #print('root ', root)
        #print('filter at level', filter_at_level)
        #print('d before', len(d), d)
        #print([f"{root}/{f}" for f in d])
        # prune the subdirectories in-place, so os.walk only descends into matching ones
        d[:] = [f for f in d if regex.search(f"{root}/{f}")]
        #print('d after', len(d), d)

        #abs_filepaths = [f for f in files if re.search(current_regex, str(f))]
        #abs_filepaths = [f for f in files if current_re.search(f)]
        abs_filepaths = [f"{root}/{f}" for f in files if current_re.search(f"{root}/{f}")]
        #logger.debug("valid_files {} {}".format(root, [f for f in files if re.search(current_regex, str(f))]))
        #print("skipped_files {} {}".format(root, [f for f in files if not re.search(current_regex, str(f))]))
        all_files += abs_filepaths

    #files = [f for f in os.listdir('.') if re.match(r'[0-9]+.*\.jpg', f)]
    return all_files

def walk_pathlib(pathinfo):
    """match the names and subdirs with regex using pathlib

    should give cross-platform compatibility, but comes with
    a performance penalty (>3x)

    Args:
        pathinfo: dict

    Returns:
        all_files
    """
    all_files = []
    current_regex = pathinfo['matching_subdirs'] if 'matching_subdirs' in pathinfo else ''
    current_re = re.compile(current_regex)
    prefilter = pathinfo['base_dir'] + pathinfo['prefilter_subdirs'] if 'prefilter_subdirs' in pathinfo else '.*'

    for root, _, files in path_walk(Path(pathinfo['base_dir']), prefilter):
        print('walk ', root, len(list(files)), files[:10])
        #abs_filepaths = [f for f in files if re.search(current_regex, str(f))]
        abs_filepaths = [f for f in files if current_re.search(str(f))]
        #logger.debug("valid_files {} {}".format(root, [f for f in files if re.search(current_regex, str(f))]))
        #print("skipped_files {} {}".format(root, [f for f in files if not re.search(current_regex, str(f))]))
        all_files += abs_filepaths

    #files = [f for f in os.listdir('.') if re.match(r'[0-9]+.*\.jpg', f)]
    return all_files

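# Illustrative sketch of a pathinfo dict (all values are hypothetical) as
# consumed by walk_str/walk_pathlib and by Connector.build_filehandler below;
# the keys mirror a [path] entry of the system toml config:
# pathinfo = {
#     'base_dir': '/data/radar/',
#     'matching_subdirs': r'Y\d{4}/.*\.LV1$',
#     'date_in_filename': r'(?P<year>\d{2})(?P<month>\d{2})(?P<day>\d{2})_(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>\d{2})',
# }
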
class Connector:
    """connect the data (from the ncfiles/local sources) to larda

    Args:
        system (str): system identifier
        system_info (dict): dict info loaded from toml
        valid_dates (list of lists): list of begin and end datetime
        description_dir (optional): dir with the description rst
    """
    def __init__(self, system, system_info, valid_dates, description_dir=None):
        self.system = system
        self.system_info = system_info
        self.valid_dates = valid_dates
        self.params_list = list(system_info["params"].keys())
        self.description_dir = description_dir
        logger.info("params in this connector {} {}".format(self.system, self.params_list))
        logger.debug('connector.system_info {}'.format(system_info))

    def __str__(self):
        s = "connector for system {} \ncontains parameters: ".format(self.system)
        s += " ".join(self.params_list)
        return s

    def build_filehandler(self):
        """scrape the directories and build the filehandler
        """
        pathdict = self.system_info['path']

        filehandler = {}
        for key, pathinfo in pathdict.items():
            # 1. match the names and subdirs with regex
            #all_files = walk_pathlib(pathinfo)
            all_files = walk_str(pathinfo)
            # remove basedir (not sure if that is a good idea)
            all_files = [str(p).replace(pathinfo['base_dir'], "./") for p in all_files]
            #logger.debug('filelist {} {}'.format(len(all_files), all_files[:10]))

            # 2. extract the dates with another regex
            dates = [convert_to_datestring(pathinfo["date_in_filename"], str(f))
                     for f in all_files]
            all_files = [f for _, f in sorted(zip(dates, all_files), key=lambda pair: pair[0])]
            dates = sorted(dates)

            # 3. estimate the duration a file covers
            date_pairs = guess_end(dates) if dates else []

            # 4. validate with the durations
            valid_date_filter = setup_valid_date_filter(self.valid_dates)
            singlehandler = list(filter(
                valid_date_filter,
                list(zip(date_pairs, all_files))))
            filehandler[key] = singlehandler
        #pprint.pprint(filehandler)
        self.filehandler = filehandler

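    # Illustrative sketch (hypothetical path key and filenames) of the structure
    # that build_filehandler produces and that collect() queries later:
    # self.filehandler = {
    #     'nc_zen': [
    #         (('20210314-000000', '20210314-235959'), './scan_20210314.nc'),
    #         ...
    #     ],
    # }
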
    def save_filehandler(self, path, camp_name):
        """save the filehandler to a json file"""
        savename = 'connector_{}.json'.format(self.system)
        pretty = {'indent': 2, 'sort_keys': True}
        #pretty = {}
        if not os.path.isdir(path+'/'+camp_name):
            os.makedirs(path+'/'+camp_name)
        with open(path+'/'+camp_name+'/'+savename, 'w') as outfile:
            json.dump(self.filehandler, outfile, **pretty)
            logger.info('saved connector to {}/{}/{}'.format(path, camp_name, savename))

    def load_filehandler(self, path, camp_name):
        """load the filehandler from the json file"""
        filename = "connector_{}.json".format(self.system)
        starttime = time.time()
        with open(path+'/'+camp_name+'/'+filename) as json_data:
            self.filehandler = json.load(json_data)
        logger.info("read in json filehandler {}: {}".format(self.system, time.time() - starttime))

    def collect(self, param, time_interval, *further_intervals, **kwargs) -> dict:
        """collect the data from a parameter for the given intervals

        Args:
            param (str): identifying the parameter
            time_interval: list of begin and end datetime
            *further_intervals: range, velocity, ...
            **interp_rg_join: interpolate range during join

        Returns:
            data_container
        """
        paraminfo = self.system_info["params"][param]
        if 'interp_rg_join' not in paraminfo:
            # default value
            paraminfo['interp_rg_join'] = False
        if 'interp_rg_join' in kwargs:
            paraminfo['interp_rg_join'] = kwargs['interp_rg_join']

        base_dir = self.system_info['path'][paraminfo['which_path']]["base_dir"]
        logger.debug("paraminfo at collect {}".format(paraminfo))

        if len(time_interval) == 2:
            begin, end = [dt.strftime(DATEstrfmt) for dt in time_interval]
            # cover all three cases: 1. file only covers first part
            # 2. file covers middle part 3. file covers end
            #print(begin, end)
            flist = [e for e in self.filehandler[paraminfo['which_path']]
                     if (e[0][0] <= begin < e[0][1])
                     or (e[0][0] > begin and e[0][1] < end)
                     or (e[0][0] <= end <= e[0][1])]
            assert len(flist) > 0, "no files available"
        elif len(time_interval) == 1:
            begin = time_interval[0].strftime(DATEstrfmt)
            flist = [e for e in self.filehandler[paraminfo['which_path']]
                     if e[0][0] <= begin < e[0][1]]
            assert len(flist) == 1, "flist too long or too short: {}".format(len(flist))
        #[print(e, (e[0][0] <= begin and e[0][1] > begin), (e[0][0] > begin and e[0][1] < end), (e[0][0] <= end and e[0][1] >= end)) for e in flist]

        load_data = setupreader(paraminfo)
        datalist = [load_data(Path(base_dir + e[1]), time_interval, *further_intervals)
                    for e in flist]
        # [print(e.keys) if e != None else print("NONE!") for e in datalist]
        # the reader returns None if it detects no data prior to begin;
        # these None values are now filtered from the list
        assert len(datalist) > 0, 'No data found for parameter: {}'.format(param)
        datalist = list(filter(lambda x: x is not None, datalist))
        #Transf.join(datalist[0], datalist[1])
        data = functools.reduce(Transf.join, datalist)

        return data

    def collect_path(self, param, time_interval, *further_intervals, **kwargs) -> dict:
        """collect the data from a parameter for explicitly given file paths

        Returns:
            data_container
        """
        assert 'paths' in kwargs, "Without filepaths reading is tricky"
        flist = kwargs['paths']
        flist = [Path(f) if type(f) == str else f for f in flist]
        paraminfo = self.system_info["params"][param]
        load_data = setupreader(paraminfo)
        datalist = [load_data(e, time_interval, *further_intervals) for e in flist]
        assert len(datalist) > 0, 'No data found for parameter: {}'.format(param)
        datalist = list(filter(lambda x: x is not None, datalist))
        #Transf.join(datalist[0], datalist[1])
        data = functools.reduce(Transf.join, datalist)
        return data

    def description(self, param) -> str:
        """get the description str for a parameter of this connector"""
        paraminfo = self.system_info["params"][param]
        #print('connector local paraminfo: ' + paraminfo['variable_name'])
        # log the nicely formatted parameter dictionary
        # this is the python pprint module, not the larda.helpers function
        pp = pprint2.PrettyPrinter(indent=4)
        logger.info(pp.pformat(paraminfo))

        if 'description_file' not in paraminfo:
            return 'no description file defined in config'
        if self.description_dir is None:
            return 'description dir not set'

        description_file = self.description_dir / paraminfo['description_file']
        logger.info('load description file {}'.format(description_file))
        with open(description_file, 'r', encoding="utf-8") as f:
            descr = f.read()
        descr = "\n" + descr + "\n"
        logger.warning(descr)
        return descr

    def get_as_plain_dict(self) -> dict:
        """put the most important information of the connector
        into a plain dict (for http transfer)

        Returns:
            connector information

        .. code::

            {params: {param_name: fileidentifier, ...},
             avail: {fileidentifier: {"YYYYMMDD": no_files, ...}, ...}}
        """
        return {
            'params': {e: self.system_info['params'][e]['which_path'] for e in self.params_list},
            'avail': {k: self.files_per_day(k) for k in self.filehandler.keys()}
        }

    def files_per_day(self, which_path) -> dict:
        """replaces ``days_available`` and ``day_available``

        Returns:
            dict with days and no of files

        .. code::

            {'YYYYMMDD': no of files, ...}
        """
        fh = self.filehandler[which_path]
        groupedby_day = collections.defaultdict(list)
        for d, f in fh:
            groupedby_day[d[0][:8]] += [f]
        no_files_per_day = {k: len(v) for k, v in groupedby_day.items()}
        return no_files_per_day

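
# Illustrative end-to-end sketch (system name, config dict and dates are
# hypothetical); this is roughly how larda drives the local Connector:
# >>> conn = Connector('limrad94', system_info, [['20210301', '20210331']])
# >>> conn.build_filehandler()
# >>> data = conn.collect('Ze', [datetime.datetime(2021, 3, 14, 0), datetime.datetime(2021, 3, 14, 6)])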