# Source code for famafrench.utils

"""
# This file is part of famafrench
# Copyright (c) 2020, Christian Jauregui <chris.jauregui@berkeley.edu>
# See file LICENSE.txt for license information.

Filename
_________
`famafrench/utils.py`

Descriptions
____________
lru_cached_method
    Wrapper for methodtools.lru_cache(maxsize) that allows for sphinx documentation's
    recognition of decorated instance methods.
get_kfpriorfactors_directly
    Directly download zipped datafiles from Ken French's online data library without
    the use of the 'pandas-datareader' package. This function is used for the few
    datafiles for which 'pandas-datareader' returns an error.
timing
    Wrapper for class methods that are to be timed for speed and performance measurement.
any_in
    Provide a boolean variable that is =True if elements in a given set intersect with
    elements in another set, =False, otherwise.
priormonthToDay
    Using Fama and French's methodology, map the prior (j-k) monthly return strategy into
    a daily strategy (see online documentation provided on Ken French's website).
grouped_vwAvg
     Calculate weighted (net) portfolio return for a given portfolio with weights within a
     group or set of groups. This function is FASTER THAN groupby(...).apply(...) because
     it avoids non-optimized aggregation.
portRetAvg
    Compute a simple average across different columns.
get_statsTable
    Construct tables with formatted summary statistics.
"""

# Author contact in "Name <email>" form (the closing angle bracket was missing).
__author__ = 'Christian Jauregui <chris.jauregui@berkeley.edu>'
# Public API of this module: the names exported by a star-import.
# Every entry corresponds to a function defined further down in this file.
__all__ = [
    "lru_cached_method",
    "get_kfpriorfactors_directly",
    "timing",
    "any_in",
    "priormonthToDay",
    "grouped_vwAvg",
    "portRetAvg",
    "get_statsTable",
]

# Standard Imports
import weakref
import pandas as pd
from functools import wraps
from methodtools import lru_cache  # see documentation: https://pypi.org/project/methodtools/
from time import time

# Function: lru_cached_method(.,.):
def lru_cached_method(*lru_args, **lru_kwargs):
    """
    Wrapper for :func:`methodtools.lru_cache` enabling recognition of `decorated`
    class instance methods by `Sphinx <https://www.sphinx-doc.org/en/master/>`_.

    The cache is created lazily, per instance: on the first call of the decorated
    method through an instance, a cached function bound (weakly) to that instance is
    built and stored on the instance under the method's name, so later calls on the
    same instance go straight to the cached function.

    Parameters
    __________
    *lru_args : arbitrary
        Variable number of arguments to :func:`methodtools.lru_cache`

    **lru_kwargs : arbitrary
        Keyworded, variable-length argument list for :func:`methodtools.lru_cache`

    Returns
    _______
    decorator : arbitrary `wrapped` object
        Decorator that takes the method to cache and returns the wrapped method.

    Raises
    ______
    ReferenceError
        Raised by the cached function if it is called after the instance it was
        created for no longer exists.
    """
    def decorator(wrapped_fn):
        @wraps(wrapped_fn)
        def wrapped(self, *args, **kwargs):
            # Use a weak reference to self. This prevents a self-reference cycle
            # (instance -> cached function -> instance) that would keep the instance
            # alive after all external references are dropped.
            weak_ref_to_self = weakref.ref(self)

            @wraps(wrapped_fn)
            @lru_cache(*lru_args, **lru_kwargs)
            def cached(*args, **kwargs):
                instance = weak_ref_to_self()
                if instance is None:
                    # Fail loudly instead of silently calling the method with self=None.
                    raise ReferenceError(
                        'The instance owning the cached method {!r} no longer exists.'.format(
                            wrapped_fn.__name__))
                return wrapped_fn(instance, *args, **kwargs)

            # Shadow the class-level method on this instance with the cached function,
            # so this set-up code runs only once per instance.
            setattr(self, wrapped_fn.__name__, cached)
            return cached(*args, **kwargs)
        return wrapped
    return decorator


# Function: get_kfpriorfactors_directly(.,.):
def get_kfpriorfactors_directly(kflib_name, kflib_freq, kf_factor):
    """
    Directly download (from Ken French's online library) zipped monthly or annual datafiles
    for the `Short-Term Reversal` or `Long-Term Reversal` Fama-French-style factors.
    This is required since the :meth:`pandas_datareader.web` method is broken for such datafiles.

    Parameters
    ___________
    kflib_name : str
        Name of zipped datafile.
    kflib_freq : str
        Observation frequency of factor portfolios. Possible choices are:

            * ``M``: monthly
            * ``A``: annual
    kf_factor : str
        The name or "label" of the Fama-French-style factor. Possible choices are:

            * ``ST_Rev`` : Short-Term Reversal - based on Prior (1-1) returns
            * ``LT_Rev`` : Long-Term Reversal - based on Prior (13-60) returns

        Any other label is title-cased before being looked up in the datafile, and the
        resulting column is renamed to the upper-cased label.

    Returns
    ________
    kflib_data : pandas.DataFrame
        Dataset with time-series of the Fama-French-style factor, indexed by ``Date``.
        Values equal to ``-99.99`` are replaced with ``NaN``.
    """
    from io import BytesIO
    from zipfile import ZipFile
    from urllib.request import urlopen

    # numpy is not imported at module level in this file, so bring it into scope here.
    import numpy as np

    urllink = "http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"+kflib_name+"_CSV.zip"

    # Download the zipfile; the context manager closes the HTTP response.
    with urlopen(urllink) as url:
        payload = url.read()

    # Create the pandas.DataFrame from the CSV member; both archive handles are closed afterwards.
    with ZipFile(BytesIO(payload)) as zipfile:
        with zipfile.open(kflib_name+".CSV") as csvfile:
            kflib_data = pd.read_csv(csvfile, header=0, skiprows=13)
    kflib_data.rename(columns={'Unnamed: 0': 'Date'}, inplace=True)
    kflib_data.columns = list(map(str.rstrip, kflib_data.columns))

    if kf_factor not in ['ST_Rev', 'LT_Rev']:
        kf_factor = kf_factor.title()

    # Row labels where the factor column is missing. The first one is used as the end
    # of the monthly data and as the anchor for the start of the annual data.
    rows_with_all_nan = kflib_data.index[kflib_data[kf_factor].isna()]
    first_nan = rows_with_all_nan[0]

    # ``.copy()`` on each slice makes the column assignments below operate on an
    # independent frame rather than on a view of the parsed file.
    if kflib_freq == 'M':
        kflib_data = kflib_data[:first_nan].copy()
        kflib_data['Date'] = pd.to_datetime(kflib_data['Date'], format='%Y%m').dt.to_period('M')
    elif kflib_freq == 'A':
        # The annual data ends at the missing row that follows the largest gap between
        # consecutive missing rows. Only needed (and only computed) for annual data.
        diffrows_with_all_nan = [j-i for i, j in zip(rows_with_all_nan[:-1], rows_with_all_nan[1:])]
        second_nan = rows_with_all_nan[diffrows_with_all_nan.index(max(diffrows_with_all_nan))+1]
        kflib_data = kflib_data[first_nan+4:second_nan].copy()
        kflib_data['Date'] = pd.to_datetime(kflib_data['Date'], format='%Y').dt.to_period('Y')

    kflib_data = kflib_data[['Date', kf_factor]].copy()
    # Cast to float BEFORE replacing the -99.99 sentinel, so the sentinel is also caught
    # when the column was parsed as strings.
    kflib_data[kf_factor] = kflib_data[kf_factor].astype(float).replace(-99.99, np.nan)
    if kf_factor not in ['ST_Rev', 'LT_Rev']:
        kflib_data = kflib_data.rename(columns={kf_factor: kf_factor.upper()})
    kflib_data.set_index('Date', inplace=True)
    return kflib_data


# Function: timing(.)
def timing(func):
    """
    Wrapper for class instance methods enabling the timing of execution.
    Important for measuring speed and performance.

    The elapsed time (in seconds, rounded to 3 decimals) is printed after every
    successful call of the wrapped function; nothing is printed if the call raises.

    Parameters
    ___________
    func : func
        Function to be wrapped and timed following execution.

    Returns
    ________
    wrapper : arbitrary `wrapped` object
        Wrapped function returning the result(s) of `func` unchanged.
    """
    # Use a monotonic, high-resolution clock: unlike the wall clock it cannot jump
    # backwards or forwards if the system time is adjusted mid-measurement.
    from time import perf_counter

    @wraps(func)
    def wrapper(*args, **kwargs):
        t0 = perf_counter()
        result = func(*args, **kwargs)
        print("Elapsed time: ", round(perf_counter() - t0, 3), " seconds.\n")
        return result
    return wrapper


# Function: any_in(.,.)
def any_in(a_set, b_set):
    """
    Boolean variable that is ``True`` if elements in a given set `a_set` intersect
    with elements in another set `b_set`. Otherwise, the boolean is ``False``.

    Parameters
    ___________
    a_set : iterable
        First collection of elements (e.g. a list). Its elements must be hashable,
        since it is converted to a :class:`set`.
    b_set : iterable
        Second collection of elements (e.g. a list).

    Returns
    ________
    bool
        ``True`` if at least one element is common to both collections.
        Otherwise (including when either collection is empty), ``False``.
    """
    return not set(a_set).isdisjoint(b_set)


# Function: priormonthToDay(.,.,.)
def priormonthToDay(freq, j_mth, k_mth):
    r"""
    Consistent w/ Fama and French (2008, 2016), map the prior `(j-k)` monthly return strategy into a daily strategy
    (see `Ken French's online documentation  <https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html>`_).

    Parameters
    ___________
    freq : str
        Frequency used to calculate prior `(j-k)` return strategy. Possible choices are:

            * ``D`` : daily (``W`` is treated the same way)
            * ``M`` : monthly (``Q`` and ``A`` are treated the same way)
    j_mth : str
        Lagged month (or day) we start measuring stock performance.
    k_mth : str
        How many months (or days) are used in measuring stock performance.

    Returns
    ________
    j_per, k_per : tuple, str
        ``j_per = j_mth`` and ``k_per = k_mth`` if ``freq`` is ``M``, ``Q`` or ``A``.
        Otherwise, monthly figures mapped to daily periods using the description found on Ken French's online documentation:

            * `Daily Momentum <https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/det_mom_factor_daily.html>`_.
            * `Daily Short-Term Reversal <https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/det_st_rev_factor_daily.html>`_.
            * `Daily Long-Term Reversal <https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/det_lt_rev_factor_daily.html>`_.

    Raises
    ______
    ValueError
        If ``freq`` is ``D`` or ``W`` and ``(j_mth, k_mth)`` is not one of ``('2', '12')``,
        ``('1', '1')`` or ``('13', '60')``, or if ``freq`` is not a recognized frequency.

    Note
    ____
    Monthly ``M`` (daily ``D``) strategies involve portfolios formed every month `t-1` (or day `t-1`)
    for month `t` (or day `t`).

    Note
    ____
    The Fama and French (2008, 2016) momentum strategy definition differs from that of Jegadeesh and Titman (1993).
    Jegadeesh and Titman (1993) consider **J/K** strategies, which include portfolios formed on stock performance over the previous
    **J** months (excluding the last week or month prior to portfolio formation, to remove the
    large short-horizon reversals associated with bid-ask bounce) and hold portfolios for **K** months, where **J**, **K** :math:`\in` {3,6,9,12}.
    Future updates to this module will extend this package to include these additional momentum strategies.


    References
    ___________
    *   Fama, Eugene F., and Kenneth R. French. (2008). `Dissecting Anomalies`,
        Journal of Finance, 63(4), pp.1653-1678

    *   Fama, Eugene F., and Kenneth R. French. (2016). `Dissecting Anomalies with a Five-Factor Model`,
        Review of Financial Studies, 29(1), pp.69-103
    """
    # Per Fama and French, map prior (2-12), (1-1), and (13-60) returns
    # at the monthly frequency to the daily frequency
    monthly_to_daily = {
        ('2', '12'): ('21', '250'),
        ('1', '1'): ('1', '20'),
        ('13', '60'): ('251', '1250'),
    }

    if freq in ['D', 'W']:
        try:
            return monthly_to_daily[(j_mth, k_mth)]
        except (KeyError, TypeError):
            # TypeError covers unhashable arguments, which can never match a standard strategy.
            raise ValueError('\'prior (j-k)\' return strategy not of the standard Fama and French type.') from None
    elif freq in ['M', 'Q', 'A']:
        return j_mth, k_mth
    else:
        raise ValueError('Please specify one of the following frequencies: '
                         '\'D\', \'W\', \'M\', \'Q\', or \'A\'')


# Function: grouped_vwAvg(.,.,.,.,.)
def grouped_vwAvg(df0, col_values, col_weights, *groupby_args, **groupby_kwargs):
    """
    Calculate (net) weighted portfolio return for portfolio with weights ``col_weights`` within a group or groups.

    For every column in ``col_values`` the result is
    ``sum(value * weight) / sum(weight)`` within each group, where observations with a
    missing ``value * weight`` product are left out of both sums.

    Parameters
    ___________
    df0 : pandas.DataFrame
        Dataset containing firm-level stock returns and portfolio weights, both indexed by dates and firm identifiers when required.
        It is not modified.
    col_values : list, str
        Column(s) to average over.
    col_weights : str
        Column containing the portfolio weights.
    *groupby_args : list, str, [optional]
        args to pass into `groupby` (ie the level to group on).
    **groupby_kwargs : list, str, [optional]
        kwargs to pass into `groupby`.

    Returns
    ________
    df1 : pandas.Series, or pandas.DataFrame
        Value-weighted averages per group: a :class:`pandas.Series` named after the value
        column when a single column is requested, otherwise a :class:`pandas.DataFrame`
        with one column per requested value column.

    Note
    ____
    Function is **FASTER** than :func:`pandas.core.groupby.GroupBy.apply` since it avoids non-optimized aggregation.
    """
    if isinstance(col_values, str):
        col_values = [col_values]

    # Work on a single copy (instead of one copy per value column) so `df0` stays untouched.
    work = df0.copy()
    weights = 'weights_{w}'.format(w=col_weights)

    ss = []
    for value in col_values:
        value_x_weight = 'product_{v}_{w}'.format(v=value, w=col_weights)

        work[value_x_weight] = work[value] * work[col_weights]
        # Drop the weight wherever the product is missing so it does not inflate the denominator.
        work[weights] = work[col_weights].where(work[value_x_weight].notnull())

        # Aggregate only the two helper columns: summing every column of the frame is
        # wasted work and fails for columns that cannot be summed (e.g. datetimes).
        sums = work.groupby(*groupby_args, **groupby_kwargs)[[value_x_weight, weights]].sum()
        s = sums[value_x_weight] / sums[weights]

        s.name = value
        ss.append(s)
    df1 = pd.concat(ss, axis=1) if len(ss) > 1 else ss[0]
    return df1


# Function: portRetAvg(.)
def portRetAvg(df):
    """
    Compute a simple (equal-weighted) average across different columns.

    Parameters
    ___________
    df : pandas.DataFrame
        Dataset with columns to average over.

    Returns
    ________
    dfavg : pandas.Series
        Row-wise average of the columns of ``df``. A row with a missing value in any
        column yields ``NaN``, because the row sum is only formed when all columns are non-missing.
    """
    n_cols = len(df.columns)
    # `min_count=n_cols` makes the sum NaN unless every column is present in the row.
    dfavg = df.sum(axis=1, min_count=n_cols) / n_cols
    return dfavg


# Function: get_statsTable(.,.,.,.)
def get_statsTable(dType, dFreq, df, dates_as_index=True, ptiles=(0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99)):
    """
    Construct detailed tables with summary statistics.

    The table is printed (preceded by a banner with the observation frequency) and returned.

    Parameters
    ___________
    dType : str
        Dataset type of the portfolios. Possible choices are:

            * ``Returns``
            * ``Factors``
            * ``NumFirms``
            * ``Characs``
    dFreq : str
        Observation frequency of the portfolios. Only used in the printed banner. Possible choices are:

            * ``D`` : daily
            * ``W`` : weekly
            * ``M`` : monthly
            * ``Q`` : quarterly (3-months)
            * ``A`` : annual
    df : pandas.DataFrame
        Dataset w/ portfolio returns (which may include factor returns), number of firms in each portfolio,
        or `average` anomaly portfolio characteristics for a given portfolio sorting strategy.
    dates_as_index : bool
        Flag determining whether ``df`` has a :class:`pandas.DatetimeIndex` index (``dates_as_index = True``).
        Otherwise, ``dates_as_index = False``.
    ptiles : list, float, default (0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99)
        Percentiles (in decimal format) included as part of output results.


    Returns
    ________
    statsTable : pandas.DataFrame
        Summary statistics of the dataset (rounded to 2 decimals) including the following:

            * number of observations :meth:`pandas.DataFrame.count`
            * sample mean :meth:`pandas.DataFrame.mean`
            * sample standard deviation :meth:`pandas.DataFrame.std`
            * sample min :meth:`pandas.DataFrame.min`
            * sample max :meth:`pandas.DataFrame.max`
            * sample skewness :meth:`pandas.DataFrame.skew`
            * sample kurtosis :meth:`pandas.DataFrame.kurtosis`
            * sample mean absolute deviation (mean of absolute deviations from the sample mean)
            * sample percentiles

        If ``dates_as_index = True``, then the table also includes the first and last
        index value with a valid observation for each column.

        Formatting depends on ``dType``: for ``Returns``/``Factors`` every statistic other than the
        count is a string with a trailing ``%``; for ``NumFirms`` every statistic is a string
        without decimals; in all other cases only the count is converted to a string without decimals.
    """
    percentiles = list(ptiles) if ptiles is not None else None
    statsTable = df.describe(percentiles=percentiles).round(2)

    # Higher moments for the described columns. The mean absolute deviation is computed
    # by hand and the rows are stacked with `pd.concat`, because `DataFrame.mad` and
    # `DataFrame.append` are no longer available in recent pandas releases.
    described = df.reindex(statsTable.columns, axis=1)
    moments = pd.DataFrame({
        'skew': described.skew(),
        'kurt': described.kurt(),
        'mad': (described - described.mean()).abs().mean(),
    }).T
    statsTable = pd.concat([statsTable, moments]).round(2)

    # Find starting and ending date of each observation type
    if dates_as_index:
        statsTable0 = pd.DataFrame(index=['startdate', 'enddate'], columns=df.columns, dtype=object)
        for col in df.columns:
            statsTable0.loc['startdate', col] = df[col].first_valid_index()
            # Last valid observation, mirroring how the start date is determined.
            statsTable0.loc['enddate', col] = df[col].last_valid_index()
        statsTable = pd.concat([statsTable0, statsTable])

    # The formatting below stores strings next to numbers/dates, so use object columns.
    statsTable = statsTable.astype(object)

    # Position of the 'count' row: it follows the two date rows when they are present.
    count_pos = 2 if dates_as_index else 0

    if dType in ['Factors', 'Returns']:
        # Every statistic after the count is expressed in percent.
        statsTable.iloc[count_pos + 1:, :] = statsTable.iloc[count_pos + 1:, :].astype(str) + '%'
        statsTable.loc['count', :] = statsTable.loc['count', :].apply('{:.0f}'.format)
    elif dType == 'NumFirms':
        # Firm counts are shown without decimals; the date rows (if any) are left untouched.
        statsTable.iloc[count_pos:, :] = statsTable.iloc[count_pos:, :].apply(
            lambda column: column.map('{:.0f}'.format))
    else:
        statsTable.loc['count', :] = statsTable.loc['count', :].apply('{:.0f}'.format)
    print('    *********************** Observation frequency: '+dFreq+' ************************')
    print(statsTable, '\n')
    return statsTable