Source code for gepard.data

"""Representation of experimental data.

DataPoint -- class for kinematical points, possibly representing measurements
DataSet   -- container for DataPoint instances

dset -- dictionary with public datasets
"""
from __future__ import annotations

import copy
import importlib_resources
import math
import os
import re
from typing import Union, List

import numpy as np
import pandas as pd

from . import kinematics
from .constants import Mp, Mp2
from .datasets import (DIS, en2engamma, ep2epgamma, gammastarp2gammap,
                       gammastarp2Mp)

process_class_map = {
        'ep2epgamma': 'DVCS',
        'en2engamma': 'DVCS',
        'gammastarp2gammap': 'DVCS',
        'gammastarp2rho0p': 'DVMP',
        'gammastarp2phip': 'DVMP',
        'gammastarp2Vp': 'DVMP',
        'dis': 'DIS'
        }


def loaddata(resource):
    """Return dictionary {id : DataSet, ...} out of files in resource package."""
    data = {}
    files = importlib_resources.files(resource).iterdir()
    for file in files:
        if file.suffix == '.dat':
            dataset = DataSet(datafile=file.read_text())
            for pt in dataset:
                pt.to_conventions()
            data[dataset.id] = dataset
    return data

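# Illustrative usage (a sketch; `ep2epgamma` is one of the dataset resource
# packages imported at the top of this module):
#   >>> data = loaddata(ep2epgamma)
#   >>> all(isinstance(ds, DataSet) for ds in data.values())
#   True
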
class KinematicsError(Exception):
    """Exception thrown when kinematics is unphysical."""
    pass

class DataPoint(dict):
    """Kinematical point. May correspond to actual measurement.

    Args:
        xB (float): Bjorken x_B
        t (float): momentum transfer to target squared
        Q2 (float): Q^2
        phi (float): azimuthal angle
        FTn (int): harmonic of azimuthal angle, e.g. -1 for sin(phi)
        observable (str): name of the measured observable
        val (float): measurement value
        err (float): total uncertainty of `val`
        units (dict): physical units of variables
        kindict (dict): for old, alternative passing of kinematics values,
            like ``g.DataPoint({'xB': 0.1, 't': -0.2, 'Q2': 4.0})``

    There are other args as well.

    Examples:
        >>> pt = g.DataPoint(xB=0.1, t=-0.2, Q2=4.0)

    Todo:
        `kindict` is temporarily kept for backward compatibility.
        You should not rely on it.

    """
    def __init__(self, kindict: dict = None, **kwargs):
        # Just list some allowed attributes to help mypy:
        # -- Kinematics --
        self.xB = None
        self.xi = None
        self.j = None
        self.t = None
        self.Q2 = None
        # from https://stackoverflow.com/questions/4984647/
        super(DataPoint, self).__init__()
        self.__dict__ = self
        if kindict:
            self.update(kindict)
        self.update(kwargs)
        # calculate other determined kinematic variables:
        _fill_kinematics(self)

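    # Example (sketch): since __init__ calls _fill_kinematics, derived
    # variables like xi = xB/(2 - xB) and tm = -t are computed automatically:
    #   >>> pt = DataPoint(xB=0.1, t=-0.2, Q2=4.0)
    #   >>> round(pt.xi, 4), pt.tm
    #   (0.0526, 0.2)
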
    def prepare(self):
        """Pre-calculate some functions of kinematics."""
        kinematics.prepare(self)
        return

    def copy(self):
        """Copy the DataPoint object."""
        # Do we need copy.deepcopy?
        new = copy.copy(self)
        new.__dict__ = new
        return new

    def update_from_grid(self, gridline, dataset):
        """Take data gridline, and update point attributes.

        Args:
            gridline (list): list constructed from one row of data grid
                in data file.
            dataset (:obj:`DataSet`): container that is to contain this `DataPoint`

        Note:
            It is assumed that data gridline is of the form::

                x1  x2 ....  y1  y1stat  y1syst

            where `y1syst` need not be present, or can have one or two values,
            `syst+` and `syst-`. (Further elements are ignored.)

        Todo:
            This passing of higher-level DataSet as argument sounds wrong!
            (See comment about acquisition in class docstring.)

        """
        # from https://stackoverflow.com/questions/4984647/
        # we want accessibility via both attributes and dict keys
        super(DataPoint, self).__init__()
        self.__dict__ = self
        self.errtypes = ['err', 'errminus', 'errplus', 'errstat',
                         'errsyst', 'errnorm']
        # 1. Put reference to container into attribute
        self.dataset = dataset
        # 2. Put data into attributes
        # 2a. first acquire also attributes of parent DataSet
        self.update(dataset.__dict__)
        # 2b. x-axes
        for name in self.xnames:
            nameindex = int(name[1:].split('name')[0])  # = 1, 0, 2, ...
            xname = getattr(self, name)  # = 't', 'xB', ...
            xval = getattr(self, 'x' + str(nameindex) + 'value')  # = 0.1
            if isinstance(xval, float) or isinstance(xval, int):
                # we have global instead of grid value
                setattr(self, xname, xval)  # pt.xB = 0.1, pt.FTn = 1, ...
            else:
                # take value from the grid
                columnindex = int(xval.split('column')[1]) - 1  # = 1, 0, 2, ...
                setattr(self, xname, gridline[columnindex])  # pt.xB = gridline[1]
        # 2c. y-axis
        self.val = gridline[int(self.y1value.split('column')[1])-1]
        # 2d. y-axis errors
        if 'y1error' in self:
            # we are given total error already
            self.err = gridline[int(self.y1error.split('column')[1])-1]
            self.errplus = self.err
            self.errminus = self.err
        else:
            # we have to add various contributions. We do addition of variances:
            #   stat = statsym + max(stat+, stat-)
            #   syst_uncorr = systsym + max(syst+, syst-)
            #   syst = syst_uncorr + norm
            #   err = stat + syst
            # This is used for fitting chisq
            # Following two are used for pulls:
            #   err+ = statsym + stat+ + systsym + syst+ + norm
            #   err- = statsym + stat- + systsym + syst- + norm
            varstat = 0
            varsyst = 0  # uncorrelated syst err
            varsym = 0
            varplus = 0
            varminus = 0
            varnorm = 0
            # 1. statistical error
            if 'y1errorstatistic' in self:
                es = gridline[int(self.y1errorstatistic.split('column')[1])-1]**2
                varstat += es
                varsym += es
            if 'y1errorstatisticplus' in self:
                ep = gridline[int(self.y1errorstatisticplus.split('column')[1])-1]**2
                em = gridline[int(self.y1errorstatisticminus.split('column')[1])-1]**2
                varstat += max(ep, em)
                varplus += ep
                varminus += em
            # 2. systematic error
            if 'y1errorsystematic' in self:
                es = gridline[int(self.y1errorsystematic.split('column')[1])-1]**2
                varsyst += es
                varsym += es
            if 'y1errorsystematicplus' in self:
                ep = gridline[int(self.y1errorsystematicplus.split('column')[1])-1]**2
                em = gridline[int(self.y1errorsystematicminus.split('column')[1])-1]**2
                varsyst += max(ep, em)
                varplus += ep
                varminus += em
            # 3. normalization error (specified as percentage)
            if 'y1errornormalization' in self:
                varnorm += (self.y1errornormalization * self.val)**2
            # 4. TOTAL errors
            self.errplus = math.sqrt(varsym + varplus + varnorm)
            self.errminus = math.sqrt(varsym + varminus + varnorm)
            self.errstat = math.sqrt(varstat)
            self.errsyst = math.sqrt(varsyst + varnorm)
            self.errnorm = math.sqrt(varnorm)
            # FIXME: One needs to make a choice here and we go conservative
            self.err = math.sqrt(varstat + varsyst + varnorm)
            # alternative:
            # self.err = (self.errplus+self.errminus)/2.
        # 2e. calculate standard kinematical variables
        _fill_kinematics(self)
        # 2f. polarizations
        # Unpolarized in1 particle
        if 'in1polarization' not in self:
            self.in1polarization = 0
        # For transversally polarized target set, if needed and
        # if not already set, by default take dominant sine-varphi harmonic
        if ('in2polarizationvector' in self and self.in2polarizationvector == 'T'
                and 'varFTn' not in self):
            self.varFTn = -1
        return

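    # Worked example (sketch) of the variance addition above: for a gridline
    # with symmetric statistical error 0.03, symmetric systematic error 0.04
    # and no normalization error, the total is
    #   err = sqrt(0.03**2 + 0.04**2) = 0.05,
    # with errstat = 0.03 and errsyst = 0.04.
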
    def __repr__(self):
        """Print something useful."""
        try:
            return "DataPoint: " + self.observable + " = " + str(self.val)
        except AttributeError:
            return "DataPoint"

    def to_conventions(self):
        """Transform datapoint into gepard's conventions."""
        if hasattr(self, 'val'):
            self.origval = self.val  # to remember it for later convenience
        if hasattr(self, 'errtypes'):
            for errtype in self.errtypes:
                if hasattr(self, errtype):
                    setattr(self, 'orig'+errtype, getattr(self, errtype))
        # C1. azimuthal angle phi should be in radians.
        if 'phi' in self and hasattr(self, 'units') and self.units['phi'][:3] == 'deg':
            self.phi = self.phi * math.pi / 180.
            self.newunits['phi'] = 'rad'
        # C2. phi_{Trento} -> (pi - phi_{BKM})
        if 'frame' in self and self.frame == 'Trento':
            if 'phi' in self:
                self.phi = math.pi - self.phi
            elif 'FTn' in self:
                if self.FTn == 1 or self.FTn == 3 or self.FTn == -2:
                    self.val = - self.val
            # C3. varphi_{Trento} -> (varphi_{BKM} + pi)
            if 'varphi' in self:
                self.varphi = self.varphi - math.pi
            elif 'varFTn' in self:
                if self.varFTn == 1 or self.varFTn == -1:
                    self.val = - self.val
                else:
                    raise ValueError('varFTn = {} not allowed. Only +/-1!'.format(
                        self.varFTn))
            self.newframe = 'BMK'
        # C4. cross-sections should be in nb
        if hasattr(self, 'units') and self.units[self.observable] == 'pb/GeV^4':
            self.val = self.val/1000
            for errtype in self.errtypes:
                if hasattr(self, errtype):
                    err = getattr(self, errtype)
                    setattr(self, errtype, err/1000)
            self.newunits[self.observable] = 'nb/GeV^4'

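    # Example (sketch) of the conventions above: a Trento-frame point with
    # units['phi'] == 'deg' and phi = 45 is first converted by C1 to pi/4 rad
    # and then reflected by C2 into the BMK convention, phi -> pi - pi/4 = 3*pi/4.
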
    def from_conventions(self):
        """Transform stuff from Approach's conventions into original data's."""
        # C4. cross-sections should be in nb
        if hasattr(self, 'units') and self.units[self.observable] == 'pb/GeV^4':
            self.val = self.val*1000
            for errtype in self.errtypes:
                if hasattr(self, errtype):
                    err = getattr(self, errtype)
                    setattr(self, errtype, err*1000)
        # C2. phi_{BKM} -> (pi - phi_{Trento})
        if 'frame' in self and self.frame == 'Trento':
            if 'phi' in self:
                self.phi = math.pi - self.phi
            elif 'FTn' in self:
                if self.FTn == 1 or self.FTn == 3:
                    self.val = - self.val
            # C3. varphi_{Trento} -> (varphi_{BKM} + pi)
            if 'varphi' in self:
                self.varphi = self.varphi + math.pi
            elif 'varFTn' in self:
                if self.varFTn == 1 or self.varFTn == -1:
                    self.val = - self.val
            self.newframe = 'Trento'
        # C1. azimuthal angle phi back to degrees
        if 'phi' in self and hasattr(self, 'units') and self.units['phi'][:3] == 'deg':
            self.phi = self.phi / math.pi * 180.

    def orig_conventions(self, val):
        """Like from_conventions, but for the prediction val."""
        # This doesn't touch self
        # C4. cross-sections nb --> pb
        if hasattr(self, 'units') and self.units[self.observable] == 'pb/GeV^4':
            val = val*1000
        # C2. phi_{BKM} --> (pi - phi_{Trento})
        if 'frame' in self and self.frame == 'Trento' and 'FTn' in self:
            if self.FTn == 1 or self.FTn == 3 or self.FTn == -2:
                val = - val
        if 'frame' in self and self.frame == 'Trento' and 'varFTn' in self:
            if self.varFTn == 1 or self.varFTn == -1:
                val = - val
        return val


class DataSet(list):
    """A container for `DataPoint` instances.

    Args:
        datapoints (list): list of :obj:`DataPoint` instances
        datafile (str): data file to be parsed, represented as string

    Either take explicit list of DataPoints or get them by parsing datafile.

    Information that is common to all data of a given dataset (i.e. which
    is contained in a preamble of datafile) is accessible via attributes:

    Attributes:
        observable (str): name of observable measured
        collaboration (str): name of experimental collaboration
        units (dict): physical units of variables
        newunits (dict): internal physical units of variables

    There are other attributes as well.

    """
    def __init__(self, datapoints=None, datafile=None):
        if datapoints:
            list.__init__(self, datapoints)
        else:
            # we have datafile
            list.__init__(self)
            preamble, data = self.parse(datafile)
            # Create needed attributes before creating `DataPoint`s
            # Preamble stuff goes into attributes
            for key in preamble:
                try:
                    # try to convert to number everything that is number
                    setattr(self, key, _str2num(preamble[key]))
                except ValueError:
                    # rest stays as is
                    setattr(self, key, preamble[key])
            # Extracting names of x-axes variables and observable
            #   xnames = ['x1name', 'x2name', ...], not necessarily sorted!
            #   xaxes = ['t', 'xB', ...]
            self.xnames = [key for key in preamble if re.match(r'^x\dname$', key)]
            self.xaxes = [preamble[key] for key in self.xnames]
            self.observable = preamble['y1name']
            # Good to have:
            self.filename = os.path.split(datafile)[-1]
            # Following dictionary will contain units for everything
            # i.e. {'phi' : 'degrees', 't' : 'GeV^2', ...}
            self.units = dict((preamble[key], preamble[key[:2]+'unit'])
                              for key in self.xnames)
            self.units[self.observable] = preamble['y1unit']
            # Following dictionary will have units which are changed so that
            # they match units used for internal theoretical formulas
            self.newunits = {}
            # charge of first particle FIXME: just electron treated
            if self.in1particle in ['e+', 'ep']:    # positron
                self.in1charge = +1
            elif self.in1particle in ['e', 'e-', 'em']:
                self.in1charge = -1
            # Mandelstam s, if specified
            try:
                if self.process in ['ep2epgamma', 'en2engamma']:
                    if self.exptype == 'fixed target':
                        self.s = 2 * Mp * self.in1energy + Mp2
                    elif self.exptype == 'collider':
                        self.s = 2 * self.in1energy * (self.in2energy + math.sqrt(
                            self.in2energy**2 - Mp2)) + Mp2
                    else:
                        pass  # FIXME: raise error
            except AttributeError:
                pass
            if hasattr(self, 'process'):
                self.process_class = process_class_map[self.process]
            for gridline in data:
                pt = DataPoint()
                pt.update_from_grid(gridline, self)
                self.append(pt)

    def __add__(self, rhs):
        """Add datasets.

        Args:
            rhs (:obj:`DataSet`): dataset to be appended

        Returns:
            joined dataset

        http://stackoverflow.com/questions/8180014/how-to-subclass-python-list-without-type-problems

        """
        return DataSet(datapoints=list.__add__(self, rhs))

    def __repr__(self):
        """Pretty-print dataset."""
        return 'DataSet with {} points'.format(len(self))

    def __getitem__(self, key):
        """Get an element of dataset i.e. datapoint."""
        # From https://stackoverflow.com/questions/2936863/
        if isinstance(key, slice):
            lst = [self[k] for k in range(*key.indices(len(self)))]
            tmp = DataSet(lst)
            tmp.__dict__ = self.__dict__.copy()  # transfer the attributes
            return tmp
        elif isinstance(key, int):
            return list.__getitem__(self, key)
        else:
            raise TypeError("Invalid argument type.")

    def __getslice__(self, start, end):
        """Get a range of elements."""
        # This is called by [:], while [::] calls __getitem__()
        if start >= len(self):
            raise IndexError("""%s has only %d items and your slice
                starts at %d""" % (self, len(self), start))
        return DataSet(self[start:end:None])

    def parse(self, dataFile):
        """Parse `dataresource` string and return tuple (preamble, data).

        Args:
            dataFile (str): datafile to be parsed, as string

        Returns:
            (preamble, data) (tuple): `preamble` is dictionary obtained by
                converting datafile preamble items into dictionary items like
                this::

                    y1 = AC  from datafile goes into  {'y1' : 'AC', ...}

                `data` is actual numerical grid of experimental data converted
                into list of lists.

        """
        # [First] parsing the formatted ASCII file
        desc = {}   # description preamble (reference, kinematics, ...)
        data = []   # actual data grid  x1 x2 ... y1 dy1_stat dy1_syst ...
        for dataFileLine in dataFile.splitlines():
            # remove comments
            dataFileLine = dataFileLine.split('#')[0]
            # only lines with '=' (preamble) or with numbers only (data grid)
            # are parsed
            if re.search(r'=', dataFileLine):
                # converting preamble line into dictionary item
                desctpl = tuple([s.strip() for s in dataFileLine.split("=")])
                desc[desctpl[0]] = desctpl[1]
            if re.match(r'([ \t]*[-\.\d]+[ \t\r]+)+', dataFileLine):
                # FIXME: TAB-delimited columns are not handled! Only spaces are OK.
                snumbers = re.findall(r'[-\.\d]+', dataFileLine)
                numbers = []
                for s in snumbers:
                    f = float(s)
                    if (f - int(f)) == 0:
                        # we have integer
                        numbers.append(int(f))
                    else:
                        numbers.append(f)
                data.append(list(map(float, numbers)))
        return desc, data

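    # Sketch of the ASCII format that parse() accepts (a hypothetical minimal
    # datafile; real files in gepard.datasets carry much fuller preambles):
    #
    #     y1 = ALU           <- preamble line, becomes {'y1': 'ALU'}
    #     0.1  4.0  0.25     <- grid line, becomes [0.1, 4.0, 0.25]
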
    def df(self):
        """Return pandas DataFrame of a DataSet."""
        attrs = ['observable', 'collaboration', 'id', 'x', 'eta', 'xi', 'xB',
                 'Q2', 't', 'tm', 'in1energy', 'W', 'phi', 'FTn', 'varFTn',
                 'val', 'err', 'errminus', 'errplus', 'errstat', 'errsyst',
                 'errnorm']
        dat = []
        for pt in self:
            row = []
            for atr in attrs:
                if hasattr(pt, atr):
                    row.append(getattr(pt, atr))
                else:
                    row.append(None)
            row.append(pt)
            dat.append(row)
        return pd.DataFrame(dat, columns=attrs+['pt', ])


def _str2num(s: str) -> Union[int, float]:
    """Convert string to number, taking care if it should be int or float.

    Args:
        s: string to be converted

    Returns:
        number Union[int, float]

    http://mail.python.org/pipermail/tutor/2003-November/026136.html

    """
    if "." in s:
        return float(s)
    else:
        return int(s)

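# Examples:
#   >>> _str2num('4')
#   4
#   >>> _str2num('0.25')
#   0.25
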
def _complete_xBWQ2(kin):
    """Make trio {xB, W, Q2} complete if two of them are given in 'kin'."""
    if 'W' in kin and 'Q2' in kin and 'xB' not in kin:
        kin.xB = kin.Q2 / (kin.W**2 + kin.Q2 - Mp2)
    elif 'xB' in kin and 'Q2' in kin and 'W' not in kin:
        kin.W = math.sqrt(kin.Q2 / kin.xB - kin.Q2 + Mp2)
    elif 'xB' in kin and 'W' in kin and 'Q2' not in kin:
        kin.Q2 = kin.xB * (kin.W**2 - Mp2) / (1. - kin.xB)
    else:
        raise KinematicsError('Exactly two of {xB, W, Q2} should be given.')
    return

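# Worked example (sketch): for W = 82 GeV and Q2 = 4 GeV^2 the first branch
# above gives xB = Q2 / (W**2 + Q2 - Mp2) = 4 / (6724 + 4 - 0.88) ~ 5.9e-4,
# i.e. typical collider kinematics.
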
def _complete_tmt(kin):
    """Make duo {t, tm} complete if one of them is given in 'kin'."""
    if 't' in kin and 'tm' not in kin:
        assert kin.t <= 0
        kin.tm = - kin.t
    elif 'tm' in kin and 't' not in kin:
        assert kin.tm >= 0
        kin.t = - kin.tm
    else:
        raise KinematicsError('Exactly one of {t, tm} should be given.')
    return

def _fill_kinematics(kin, old={}):
    """Update kinematics in place.

    Complete set of kinematical variables is {xB, t, Q2, W, s, xi, tm, phi}.
    Using standard identities, missing values are calculated, if possible,
    first solely from values given in 'kin', and then, second, using values
    in 'old', if provided.

    Args:
        kin (:obj:`DataPoint`): datapoint to be updated
        old (:obj:`DataPoint`): extra kinematical info

    """
    kkeys = set(kin.keys())
    trio = set(['xB', 'W', 'Q2'])
    if len(trio.intersection(kkeys)) == 3:
        raise KinematicsError('Overdetermined set {xB, W, Q2} given.')
    elif len(trio.intersection(kkeys)) == 2:
        _complete_xBWQ2(kin)
    elif len(trio.intersection(kkeys)) == 1 and old:
        given = trio.intersection(kkeys).pop()  # one variable given in 'kin'
        # We treat only the case when one of {xB, Q2} is given and second is
        # then taken from 'old'
        if given == 'xB':
            kin.Q2 = old.Q2
        elif given == 'Q2':
            kin.xB = old.xB
        _complete_xBWQ2(kin)
    else:
        # We have zero givens, so take all three from 'old'
        if old:
            for key in trio:
                kin.__setattr__(key, old.__getattribute__(key))
    # FIXME: xi is just fixed by xB - it cannot be given by user
    # There are t/Q2 corrections, cf. BMK Eq. (4), but they are
    # formally higher twist and it is maybe sensible to DEFINE xi,
    # the argument of CFF, as follows:
    if 'xB' in kin:
        kin.xi = kin.xB / (2. - kin.xB)
    duo = set(['t', 'tm'])
    if len(duo.intersection(kkeys)) == 2:
        raise KinematicsError('Overdetermined set {t, tm=-t} given.')
    elif len(duo.intersection(kkeys)) == 1:
        _complete_tmt(kin)
    else:
        # We have zero givens, so take both from 'old'
        if old:
            for key in duo:
                kin.__setattr__(key, old.__getattribute__(key))
    # s is just copied from old, if there is one
    if old and 's' in old:
        kin.s = old.s
    # phi and varphi are copied from old, if possible and necessary
    if 'phi' not in kin and 'phi' in old:
        kin.phi = old.phi
    if 'varphi' not in kin and 'varphi' in old:
        kin.varphi = old.varphi

def select(dataset: DataSet, criteria: List[str] = [], logic='AND'):
    """Filter points of dataset satisfying criteria.

    Args:
        dataset: DataSet or any collection of DataPoints
        criteria: list of strings describing selection criteria
        logic: 'AND' or 'OR': how to logically combine criteria

    Returns:
        DataSet of DataPoints satisfying criteria.

    Example:
        criteria=['xB > 0.1', 'observable == ALU']

    """
    selected = []
    for pt in dataset:
        if logic == 'OR':
            for criterion in criteria:
                if eval('pt.'+criterion):
                    selected.append(pt)
        elif logic == 'AND':
            ok = True
            for criterion in criteria:
                if not eval('pt.'+criterion):
                    ok = False
                    break
            if ok:
                selected.append(pt)
    # convert list to DataSet instance
    tmp = DataSet(selected)
    tmp.__dict__ = dataset.__dict__.copy()  # transfer the attributes
    return tmp

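# Illustrative usage (sketch, with a hypothetical dataset id; note that each
# criterion is evaluated as 'pt.<criterion>' via eval()):
#   >>> pts = select(dset[some_id], criteria=['xB > 0.1', 'Q2 >= 1.5'])
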
def list_data(ids):
    """List basic info about datasets specified by id numbers."""
    if not isinstance(ids, list):
        ids = [ids]
    for id in ids:
        try:
            dt = dset[id]
            ref = dt.reference.replace('arXiv:', '').replace('hep-ex', '').replace(
                'nucl-ex', '').replace(
                'from Morgan Murray, draft_90@hermes.desy.de, J. Burns and M. Murray',
                'Morgan M.').replace('v1', '').replace(
                'F. Ellinghaus, QCD02', 'Frank E.').replace(
                'PRELIMINARY', 'prelim.').strip('[]/ ')
            try:
                ref2 = dt.reference2
            except AttributeError:
                ref2 = ''
            print('[%3i] %8s %3i %9s %10s %s' % (
                dt.id, dt.collaboration, len(dt), dt.observable, ref, ref2))
        except KeyError:
            pass

def describe_data(pts):
    """Print observables and where they come from."""
    all = []
    print("{:2s} x {:6s} {:6s} {:4s} {:3s} {:12s}".format(
        'npt', 'obs', 'collab', 'FTn', 'id', 'ref.'))
    print(46*'-')
    for pt in pts:
        props = []
        for prop in ['observable', 'collaboration', 'FTn', 'id', 'reference']:
            if hasattr(pt, prop):
                props.append(str(getattr(pt, prop)))
            else:
                props.append('N/A')
        all.append(tuple(props))
    tot = len(all)
    uniqs = set(all)
    cc = 0
    for uniq in sorted(uniqs):
        n = all.count(uniq)
        cc += n
        print("{:2d} x {:6s} {:6s} {:4s} {:3s} {:12s}".format(n, *uniq))
    assert cc == tot
    print(46*'-')
    print("TOTAL = {}".format(tot))

def FTF(data, testdata=None, cosmax=None, sinmax=None, inverse=False):
    """Fourier series fit to data (takes and returns pandas dataframe).

    cosmax, sinmax - index of highest harmonics
    inverse = True - data values ARE harmonics, calculate function(phi)
    """
    N = len(data)
    # If length of series is not specified by the user, use maximal one
    # which makes this fit equal to DFT
    if not cosmax and not (cosmax == 0):
        cosmax = int((N-1)/2.)  # maximal cos harmonic
    if not sinmax and not (sinmax == 0):
        sinmax = cosmax + ((N-1) % 2)  # maximal sin harmonic
    nharm = 1 + cosmax + sinmax  # number of harmonics
    if isinstance(testdata, pd.DataFrame):
        A = np.array([[np.cos(n*phi) for n in range(cosmax+1)]
                      + [np.sin(n*phi) for n in range(1, sinmax+1)]
                      for phi in testdata.phi.values])
    else:
        # take phis from first argument data
        A = np.array([[np.cos(n*phi) for n in range(cosmax+1)]
                      + [np.sin(n*phi) for n in range(1, sinmax+1)]
                      for phi in data.phi.values])
    B = data.val.values
    if not inverse:
        res = np.linalg.lstsq(A, B, rcond=None)
        # print("Sum of residuals = {}".format(res[1]))
        z = np.zeros(N-len(res[0]), dtype=A.dtype)
        vals = np.concatenate((res[0], z), axis=0)
        df = pd.DataFrame({'phi': data.phi.values, 'val': vals})
        # print("Number of harmonics = {}".format(nharm))
    else:
        if isinstance(testdata, pd.DataFrame):
            res = np.dot(A, B[:nharm])
            df = pd.DataFrame({'phi': testdata.phi.values,
                               'val': testdata.val.values, 'pred': res})
        else:
            # we invert on phis of first argument
            res = np.dot(A, B[:nharm])
            df = pd.DataFrame({'phi': data.phi.values, 'val': res})
    return df

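# Illustrative usage (sketch): fitting harmonics to equidistant mock data.
# With N = 9 points the defaults are cosmax = 4, sinmax = 4, so this is a DFT:
#   >>> phis = np.linspace(0, 2*np.pi, 9, endpoint=False)
#   >>> mock = pd.DataFrame({'phi': phis,
#   ...                      'val': 0.1 + 0.3*np.cos(phis) - 0.2*np.sin(phis)})
#   >>> harmonics = FTF(mock)
#   >>> # harmonics.val is then approx. [0.1, 0.3, 0, 0, 0, -0.2, 0, 0, 0],
#   >>> # ordered as [cos0..cos4, sin1..sin4]
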
def FTFMC(data, nsamples=100, cosmax=None, sinmax=None, inverse=False):
    """Fourier series fit to data with MC error propagation (takes and returns pandas dataframe).

    Args:
        cosmax: index of highest cosine harmonic
        sinmax: index of highest sine harmonic
        inverse: if True data values ARE harmonics, calculate function(phi)
            (default: False)

    """
    N = len(data)
    # If length of series is not specified by the user, use maximal one
    # which makes this fit equal to DFT
    if not cosmax and not (cosmax == 0):
        cosmax = int((N-1)/2.)  # maximal cos harmonic
    if not sinmax and not (sinmax == 0):
        sinmax = cosmax + ((N-1) % 2)  # maximal sin harmonic
    nharm = 1 + cosmax + sinmax  # number of harmonics
    A = np.array([[np.cos(n*phi) for n in range(cosmax+1)]
                  + [np.sin(n*phi) for n in range(1, sinmax+1)]
                  for phi in data.phi.values])
    B = np.transpose(np.ones((nsamples, N))*data.val.values
                     + np.random.randn(nsamples, N)*data.err.values)  # replicas
    if not inverse:
        res = np.linalg.lstsq(A, B, rcond=None)
        z = np.zeros((nsamples, N-len(res[0])), dtype=A.dtype)
        vals = np.concatenate((res[0].T, z), axis=1)
        df = pd.DataFrame({'phi': data.phi.values,
                           'val': vals.mean(axis=0), 'err': vals.std(axis=0)})
        # print("Number of harmonics = {}".format(nharm))
    else:
        res = np.dot(A, B[:nharm])
        df = pd.DataFrame({'phi': data.phi.values,
                           'val': res.mean(axis=1), 'err': res.std(axis=1)})
    return df

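# Illustrative usage (sketch, continuing the FTF example above, after adding
# an assumed uniform uncertainty column):
#   >>> mock['err'] = 0.01
#   >>> harmonics = FTFMC(mock, nsamples=200)  # result has 'val' and 'err'
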
def cvsets(df_in, nfolds=3, shuffle=True):
    """Return n-fold cross-validation sets [(train1, valid1), (train2, valid2), ...].

    Copies data.
    """
    if shuffle:
        df = df_in.reindex(np.random.permutation(df_in.index))
    else:
        df = df_in.copy()
    # fold-size logic stolen from sklearn
    n = len(df)
    fold_sizes = (n // nfolds) * np.ones(nfolds, dtype=int)
    fold_sizes[:n % nfolds] += 1
    current = 0
    chunks = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        chunks.append(df[start:stop])
        current = stop
    sets = []
    for f in range(nfolds):
        inds = list(range(nfolds))
        inds.pop(f)
        train = pd.concat([chunks[i] for i in inds])
        sets.append([train, chunks[f]])
    return sets

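# Illustrative usage (sketch, again with the 9-point mock dataframe):
#   >>> folds = cvsets(mock, nfolds=3)
#   >>> [(len(train), len(valid)) for train, valid in folds]
#   [(6, 3), (6, 3), (6, 3)]
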
def FTanalyse(bins, HMAX=2, nf=4, Nrep=1):
    """Determine the number of harmonics present in bins by n-fold cross-validation.

    Args:
        bins: dictionary with bins
        HMAX: highest harmonic used in searches
        nf: number of folds for cross-validation
        Nrep: number of CV repetitions (probably wrong to make > 1)

    """
    mins = []
    for nn in range(Nrep):
        for k in bins:
            df = bins[k]
            err_min = 100
            for CM in range(HMAX+1):
                for SM in range(HMAX+1):
                    errs = []
                    for cvtrain, cvtest in cvsets(df, nfolds=nf):
                        cvtest.reset_index(drop=True, inplace=True)
                        dfFT = FTF(cvtrain, cosmax=CM, sinmax=SM)
                        dftest = FTF(dfFT, testdata=cvtest, cosmax=CM, sinmax=SM,
                                     inverse=True)
                        errs.append(np.sum((dftest.val.values-dftest.pred.values)**2))
                    errs = np.array(errs)
                    # We divide by npts and not ndof because test set
                    # is NOT used for fitting, so there should be no penalization
                    # of model's complexity
                    err = errs.mean() / len(cvtest)
                    # However, experiments show that small penalty for complexity
                    # sometimes improves accuracy (but this is quite simple
                    # and creates problems with large number of folds)
                    # err = errs.mean() / (len(cvtest)-np.sqrt(CM+SM+1))
                    if err <= err_min:
                        err_min = err
                        h_min = (CM, SM)
            mins.append(h_min)
    nc, ns = np.array(mins).mean(axis=0)
    delc, dels = np.array(mins).std(axis=0)
    print("Highest extractable cos harmonic = {:.3f} +- {:.3f}".format(nc, delc))
    print("Highest extractable sin harmonic = {:.3f} +- {:.3f}\n".format(ns, dels))
    return int(np.round(nc)), int(np.round(ns))

# Load all public datasets and make them globally available
dset = loaddata(ep2epgamma)
dset.update(loaddata(gammastarp2Mp))
dset.update(loaddata(gammastarp2gammap))
dset.update(loaddata(en2engamma))
dset.update(loaddata(DIS))