Notebook

Alphalens boilerplate

In [1]:
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline import factors, filters, classifiers
from quantopian.pipeline.classifiers import Classifier
from quantopian.pipeline.factors import CustomFactor, Returns, AverageDollarVolume, SimpleMovingAverage
from quantopian.pipeline.filters import StaticAssets, Q500US, Q1500US, Q3000US, QTradableStocksUS
from quantopian.pipeline.filters.fundamentals import IsPrimaryShare
from quantopian.pipeline.classifiers.fundamentals import Sector  
from quantopian.pipeline.data.builtin import USEquityPricing

import alphalens
import math
import datetime
import numpy as np
import pandas as pd

## Helper functions

def high_volume_universe(top_liquid, min_price = None, min_volume = None):  
    """
    Computes a universe of liquid stocks, filtering out hard-to-trade names.
    Returns
    -------
    high_volume_tradable : zipline.pipeline.Filter
    """
    
    if top_liquid == 'QTradableStocksUS':
        universe = QTradableStocksUS()
    elif top_liquid == 500:
        universe = Q500US()
    elif top_liquid == 1500:
        universe = Q1500US()
    elif top_liquid == 3000:
        universe = Q3000US()        
    else:        
        universe = filters.make_us_equity_universe(
            target_size=top_liquid,
            rankby=factors.AverageDollarVolume(window_length=200),
            mask=filters.default_us_equity_universe_mask(),
            groupby=Sector(),
            max_group_weight=0.3,
            smoothing_func=lambda f: f.downsample('month_start'),
        )
    
    if min_price is not None:
        price = SimpleMovingAverage(inputs=[USEquityPricing.close],
                                    window_length=21, mask=universe)
        universe &= (price >= min_price)
        
    if min_volume is not None:
        volume = SimpleMovingAverage(inputs=[USEquityPricing.volume],
                                     window_length=21, mask=universe)
        universe &= (volume >= min_volume)
        
    return universe
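
# A quick usage sketch (the thresholds are arbitrary examples, not the ones used
# later in this notebook): build a ~500-name liquid universe, drop sub-$5 and
# thin-volume names, and use the result as a pipeline screen.
example_universe = high_volume_universe(500, min_price=5.0, min_volume=1e6)
example_pipe = Pipeline(screen=example_universe)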

def run_pipeline_chunks(pipe, start_date, end_date, chunks_len = None):
    """
    Drop-in replacement for run_pipeline.
    run_pipeline fails over very long date ranges (memory usage), so we split
    the period into chunks, run the pipeline on each and concatenate the results
    """
    chunks  = []
    current = pd.Timestamp(start_date)
    end     = pd.Timestamp(end_date)
    step    = pd.Timedelta(weeks=120) if chunks_len is None else chunks_len
    
    while current <= end:
        
        current_end = current + step
        if current_end > end:
            current_end = end
        
        print 'Running pipeline:', current, ' - ', current_end
        results = run_pipeline(pipe, current.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d"))
        chunks.append(results)
        
        # the requested end may not be a trading day, so advance from the last date actually returned
        current_end = results.index.get_level_values(0)[-1].tz_localize(None)
        current = current_end + pd.Timedelta(days=1)

    return pd.concat(chunks)
       
def construct_factor_history(factor_cls, start_date='2015-10-1', end_date='2016-2-1', 
                             factor_name='factor', top_liquid=500,
                             sector_column=None):
    """
    Creates a DataFrame containing daily factor values and sector codes for a liquidity 
    constrained universe. The returned DataFrame can be used in the factor tear sheet.
    """
    ok_universe = high_volume_universe(top_liquid)
       
    pattern_code, pattern_lag = factor_cls(mask=ok_universe)
    sector = Sector(mask=ok_universe)    
       
    pipe = Pipeline()
    pipe.add(pattern_code, factor_name)
    pipe.add(pattern_lag, 'lag')
    if sector_column is not None: # loading the sector classifier is also very slow
        pipe.add(sector, sector_column)  
    pipe.set_screen(ok_universe)

    daily_factor = run_pipeline_chunks(pipe, start_date=start_date, end_date=end_date)
    #daily_factor = run_pipeline(pipe, start_date=start_date, end_date=end_date, chunksize=250)
       
    return daily_factor.dropna()

def get_daily_price(sid_universe, start_date, end_date, extra_days_before=0, extra_days_after=0):
    """
    Fetches daily open prices for the given sids, padded with extra calendar days before/after the requested range
    """   
    extra_days = math.ceil(extra_days_before * 365.0/252.0) + 3 # trading days -> calendar days, plus a small buffer
    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d") - datetime.timedelta(days=extra_days)
    start_date = start_date.strftime("%Y-%m-%d")
    
    extra_days = math.ceil(extra_days_after * 365.0/252.0) + 3 # trading days -> calendar days, plus a small buffer
    end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=extra_days)
    end_date = end_date.strftime("%Y-%m-%d")
    
    pricing = get_pricing(sid_universe, start_date=start_date, end_date=end_date, fields='open_price')
    
    return pricing
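
The chunked runner above is a drop-in replacement for run_pipeline. A minimal sketch of how it and get_daily_price would be called on their own (a throwaway dollar-volume pipeline and hypothetical dates, not part of the study below); chunks_len, when given, must be a pd.Timedelta:

test_pipe = Pipeline(columns={'adv': AverageDollarVolume(window_length=30)},
                     screen=QTradableStocksUS())
test_result = run_pipeline_chunks(test_pipe, '2014-01-01', '2016-01-01',
                                  chunks_len=pd.Timedelta(weeks=52))
test_prices = get_daily_price(test_result.index.get_level_values(1).unique(),
                              '2014-01-01', '2016-01-01', extra_days_after=10)
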
In [2]:
import alphalens
import alphalens.performance as perf 
import alphalens.utils as utils

pattern_dict = {
  20 : 'HS',   
  2  : 'IHS', 
  10 : 'BTOP', 
  1  : 'BBOT', 
  40 : 'TTOP', 
  4  : 'TBOT', 
  30 : 'RTOP', 
  3  : 'RBOT', 
}

def run_tear_sheet(factor,
                   factor_name,
                   start_date,
                   end_date,
                   top_liquid,
                   filter_universe,
                   show_sector_plots,
                   avgretplot,
                   periods,
                   quantiles,
                   bins,
                   filter_zscore,
                   long_short,
                   prices_cache = None):
     
    sector_column = 'sector_code' if show_sector_plots else None
    days_before, days_after = (0,0)

    if avgretplot is not None:   
        days_before, days_after = avgretplot
        days_after = max(days_after, max(periods) + 1)
    
    #
    ## Run the Pipeline
    #
    print 'construct factor history'
    factorS = construct_factor_history(factor, start_date=start_date, end_date=end_date, 
                                      factor_name=factor_name, top_liquid=top_liquid,
                                      sector_column=sector_column)
    
    factorS = factorS.dropna()
    #print "Full factor ", factorS.head()
    
    for pattern, factor in factorS.groupby(by=factor_name):
        
        #print factor.describe()
        
        print "Pattern ", pattern_dict[pattern], " entries ", len(factor.index)

        #
        ## Get prices
        #
        sid_universe = set( factor.index.levels[1].unique() )
        if prices_cache is not None:
            cached_sids = set(prices_cache.columns)
            sid_universe -= cached_sids

        print 'Get pricing for %d entries' % len(sid_universe)
        if sid_universe:
            prices = get_daily_price(sid_universe, start_date=start_date, end_date=end_date, 
                                     extra_days_before=days_before, extra_days_after=days_after)
            if prices_cache is not None:
                prices = pd.concat([prices, prices_cache], axis=1)
        else:
            prices = prices_cache

        #
        ## Use Alphalens to create a factor tear sheet
        #
        print 'alphalens'    
        sectors_series = factor[sector_column] if show_sector_plots else None
        factor_data = alphalens.utils.get_clean_factor_and_forward_returns(factor=factor[factor_name],
                                                                           prices=prices,
                                                                           groupby=sectors_series,
                                                                           by_group=False,
                                                                           quantiles=quantiles,
                                                                           bins=bins,
                                                                           periods=periods,
                                                                           filter_zscore=filter_zscore,
                                                                           groupby_labels=Sector.SECTOR_NAMES)
          
        #print factor_data.describe()
        alphalens.tears.create_event_study_tear_sheet(factor_data=factor_data,
                                                      prices=prices,
                                                      avgretplot=avgretplot)
        prices_cache = prices

    return prices

Define our factor

In [3]:
from __future__ import division
from statsmodels.nonparametric.kernel_regression import KernelReg
from numpy import linspace
from scipy.signal import argrelextrema
from collections import defaultdict
import scipy.stats as stats

def _beta(stock_prices, bench_prices):
    # `linregress` returns its results in the following order:
    # slope, intercept, r-value, p-value, stderr
    regr_results = stats.linregress(y=stock_prices, x=bench_prices)
    #alpha = regr_results[1]
    beta = regr_results[0]
    #r_value = regr_results[2]
    p_value = regr_results[3]
    #stderr = regr_results[4]  
    
    # Check null hypothesis
    if p_value > 0.05:
        beta = 0.        
    return beta
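
# Quick sanity check with made-up returns: a series that is exactly twice the
# benchmark keeps its slope, so _beta should return ~2.0 here; pure noise would
# instead be zeroed by the p-value test above.
_bench_example = np.array([0.01, -0.02, 0.015, 0.005, -0.01, 0.02, -0.005, 0.01])
assert abs(_beta(2.0 * _bench_example, _bench_example) - 2.0) < 1e-6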

def find_max_min(prices):
    prices_ = prices.copy()
    prices_.index = linspace(1., len(prices_), len(prices_))
    kr = KernelReg([prices_.values], [prices_.index.values], var_type='c', bw=[1.8,1])
    f = kr.fit([prices_.index.values])
    smooth_prices = pd.Series(data=f[0], index=prices.index)
    
    local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    local_min = argrelextrema(smooth_prices.values, np.less)[0]
    
    price_local_max_dt = []
    for i in local_max:
        if (i>1) and (i<len(prices)-1):
            price_local_max_dt.append(prices.iloc[i-2:i+2].argmax())

    price_local_min_dt = []
    for i in local_min:
        if (i>1) and (i<len(prices)-1):
            price_local_min_dt.append(prices.iloc[i-2:i+2].argmin())
        
    prices.name = 'price'
    maxima = pd.DataFrame(prices.loc[price_local_max_dt])
    minima = pd.DataFrame(prices.loc[price_local_min_dt])
    max_min = pd.concat([maxima, minima]).sort_index()
    max_min.index.name = 'date'
    max_min = max_min.reset_index()
    max_min = max_min[~max_min.date.duplicated()]
    p = prices.reset_index()
    max_min['day_num'] = p[p['index'].isin(max_min.date)].index.values
    max_min = max_min.set_index('day_num').price
    
    return max_min


def find_patterns(max_min):
    patterns = defaultdict(list)

    for i in range(5, len(max_min)+1):
        window = max_min.iloc[i-5:i]

        # pattern must play out in less than 36 days
        if window.index[-1] - window.index[0] > 35:
            continue

        # Using the notation from the paper to avoid mistakes
        e1 = window.iloc[0]
        e2 = window.iloc[1]
        e3 = window.iloc[2]
        e4 = window.iloc[3]
        e5 = window.iloc[4]

        rtop_g1 = np.mean([e1,e3,e5])
        rtop_g2 = np.mean([e2,e4])
        # Head and Shoulders
        if (e1 > e2) and (e3 > e1) and (e3 > e5) and \
            (abs(e1 - e5) <= 0.03*np.mean([e1,e5])) and \
            (abs(e2 - e4) <= 0.03*np.mean([e1,e5])):
                patterns['HS'].append((window.index[0], window.index[-1]))

        # Inverse Head and Shoulders
        elif (e1 < e2) and (e3 < e1) and (e3 < e5) and \
            (abs(e1 - e5) <= 0.03*np.mean([e1,e5])) and \
            (abs(e2 - e4) <= 0.03*np.mean([e1,e5])):
                patterns['IHS'].append((window.index[0], window.index[-1]))

        # Broadening Top
        elif (e1 > e2) and (e1 < e3) and (e3 < e5) and (e2 > e4):
            patterns['BTOP'].append((window.index[0], window.index[-1]))

        # Broadening Bottom
        elif (e1 < e2) and (e1 > e3) and (e3 > e5) and (e2 < e4):
            patterns['BBOT'].append((window.index[0], window.index[-1]))

        # Triangle Top
        elif (e1 > e2) and (e1 > e3) and (e3 > e5) and (e2 < e4):
            patterns['TTOP'].append((window.index[0], window.index[-1]))

        # Triangle Bottom
        elif (e1 < e2) and (e1 < e3) and (e3 < e5) and (e2 > e4):
            patterns['TBOT'].append((window.index[0], window.index[-1]))

        # Rectangle Top
        elif (e1 > e2) and (abs(e1-rtop_g1)/rtop_g1 < 0.0075) and \
            (abs(e3-rtop_g1)/rtop_g1 < 0.0075) and (abs(e5-rtop_g1)/rtop_g1 < 0.0075) and \
            (abs(e2-rtop_g2)/rtop_g2 < 0.0075) and (abs(e4-rtop_g2)/rtop_g2 < 0.0075) and \
            (min(e1, e3, e5) > max(e2, e4)):

            patterns['RTOP'].append((window.index[0], window.index[-1]))

        # Rectangle Bottom
        elif (e1 < e2) and (abs(e1-rtop_g1)/rtop_g1 < 0.0075) and \
            (abs(e3-rtop_g1)/rtop_g1 < 0.0075) and (abs(e5-rtop_g1)/rtop_g1 < 0.0075) and \
            (abs(e2-rtop_g2)/rtop_g2 < 0.0075) and (abs(e4-rtop_g2)/rtop_g2 < 0.0075) and \
            (min(e2, e4) > max(e1, e3, e5)):  # lowest top above the highest bottom, mirroring RTOP
            patterns['RBOT'].append((window.index[0], window.index[-1]))
            
    return patterns
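
# Sanity check on made-up extrema: five alternating max/min values a few days apart,
# with the two shoulders and the two troughs inside the 3% tolerance, should be
# classified as a Head and Shoulders ('HS') pattern.
_demo_extrema = pd.Series([10.0, 9.2, 11.0, 9.25, 10.1], index=[1, 5, 10, 15, 20])
assert 'HS' in find_patterns(_demo_extrema)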


def _pattern_identification(prices, indentification_lag):
    max_min = find_max_min(prices)
    
    # we are only interested in the most recent pattern (if multiple patterns are present),
    # and the last min/max must have happened no more than "indentification_lag" days ago,
    # otherwise it would either have been identified already or be too late to be useful
    max_min_last_window = None
    last_max_min = None
    
    for i in reversed(range(len(max_min))):
        days_ago = prices.index[-1] - max_min.index[i]
        if days_ago <= indentification_lag:
            last_max_min = days_ago
            max_min_last_window = max_min.iloc[i-4:i+1]
            break

    if max_min_last_window is None:
        return pd.Series( {'code':np.nan, 'lag':np.nan} )
       
    # possibly identify a pattern in the selected window
    patterns = find_patterns(max_min_last_window)
    if len(patterns) != 1:
        return pd.Series( {'code':np.nan, 'lag':np.nan} )
    
    name, start_end_day_nums = patterns.iteritems().next()
    
    pattern_code = {
           'HS'   : 20,
           'IHS'  : 2,
           'BTOP' : 10,
           'BBOT' : 1,
           'TTOP' : 40,
           'TBOT' : 4,
           'RTOP' : 30,
           'RBOT' : 3,
    }
    
    return pd.Series( {'code':pattern_code[name], 'lag':last_max_min} )


class PatternFactor(CustomFactor):
    
    params = ('indentification_lag',)        
    inputs = [USEquityPricing.close]
    window_length = 40
    outputs = ['codes', 'lags']
    
    def compute(self, today, assets, out, close, indentification_lag):
        prices = pd.DataFrame(close, columns=assets)
        tmp = prices.apply(_pattern_identification, args=(indentification_lag,))
        out.codes[:] = tmp.loc['code']
        out.lags[:]  = tmp.loc['lag']
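
# The two outputs unpack directly into separate pipeline columns (this is what
# construct_factor_history relies on); mask, window_length and lag below are
# example values only.
example_codes, example_lags = PatternFactor(mask=Q500US(), window_length=42,
                                            indentification_lag=3)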


class PatternFactorBE(CustomFactor):
    
    params = ('indentification_lag',)        
    inputs = [USEquityPricing.close]
    window_length = 40
    outputs = ['codes', 'lags']
    market_sid = symbols('SPY').sid
    
    def compute(self, today, assets, out, close, indentification_lag):
        delta_days = 1
        market_sid = self.market_sid
        
        returns = (close[delta_days:] / close[:-delta_days]) - 1
        
        market_idx = assets.get_loc(market_sid)
        market_returns = returns[:,market_idx]
        
        betas = np.apply_along_axis(_beta, 0, returns, market_returns)
        returns -= (returns[:, [market_idx] ] * betas) # remove returns due to beta
        
        daily_ret = pd.DataFrame(returns, columns=assets).fillna(0.)
        cum_returns = (daily_ret + 1).cumprod() - 1 # daily percent returns -> cumulative percent returns
        
        tmp = cum_returns.apply(_pattern_identification, args=(indentification_lag,))
        out.codes[:] = tmp.loc['code']
        out.lags[:]  = tmp.loc['lag']
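
Before paying for a full pipeline run, the detector can be sanity-checked on a single name in a separate research cell. A sketch (symbol, dates and lag are arbitrary examples); the helper expects an integer day-number index, so the pricing index is reset first:

px = get_pricing('AAPL', start_date='2016-01-04', end_date='2016-03-04', fields='open_price')
px = px.reset_index(drop=True)        # the detector works on day numbers, not timestamps
print _pattern_identification(px, 3)  # NaNs unless a pattern completed within the last 3 days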

Define settings

In [4]:
factor_name = 'factor'

start_date  = '2003-01-01'
end_date    = '2018-01-01'
top_liquid  = 500
filter_universe = True   # very slow, filter out untradable stocks
show_sector_plots = False # very slow to load the sector column in pipeline

# alphalens specific
periods = (1, 2, 3, 4, 5, 6, 10)
quantiles = None
bins      = [-100,100]
avgretplot  = (36, 25)   # (days_before, days_after); use None to skip the average cumulative returns plot
filter_zscore = None
long_short  = False

prices_cache = None # this saves lots of time when running tear sheet multiple times
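
With quantiles=None and bins=[-100, 100], every pattern code (1 through 40) lands in a single factor quantile, so Alphalens treats each occurrence as an event rather than ranking a cross-section; combined with the per-pattern groupby in run_tear_sheet, this yields one event-study tear sheet per chart pattern. A quick illustration of the single-bin behaviour (the codes below are arbitrary examples):

print pd.cut(pd.Series([20, 2, 10, 1, 40]), bins=[-100, 100]).value_counts()
# -> a single (-100, 100] bucket containing all five events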

Run the tear sheet

In [5]:
def factor(mask):
    return PatternFactor(mask=mask, window_length = 42, indentification_lag=3)

prices_cache = \
run_tear_sheet( factor       = factor,
                factor_name  = factor_name,
                start_date   = start_date,
                end_date     = end_date,
                top_liquid   = top_liquid,
                filter_universe = filter_universe,
                show_sector_plots = show_sector_plots,
                avgretplot   = avgretplot,               
                periods      = periods,
                quantiles    = quantiles,
                bins         = bins,
                filter_zscore = filter_zscore,
                long_short   = long_short,
                prices_cache = prices_cache)
construct factor history
Running pipeline: 2003-01-01 00:00:00  -  2005-04-20 00:00:00
/usr/local/lib/python2.7/dist-packages/statsmodels/nonparametric/_kernel_base.py:514: VisibleDeprecationWarning: boolean index did not match indexed array along dimension 0; dimension is 2 but corresponding boolean dimension is 1
  dens = Kval.prod(axis=1) / np.prod(bw[iscontinuous])
Running pipeline: 2005-04-21 00:00:00  -  2007-08-09 00:00:00
Running pipeline: 2007-08-10 00:00:00  -  2009-11-27 00:00:00
Running pipeline: 2009-11-28 00:00:00  -  2012-03-17 00:00:00
Running pipeline: 2012-03-20 00:00:00  -  2014-07-08 00:00:00
Running pipeline: 2014-07-09 00:00:00  -  2016-10-26 00:00:00
Running pipeline: 2016-10-27 00:00:00  -  2018-01-01 00:00:00
Pattern  BBOT  entries  7678
Get pricing for 1253 entries
alphalens
/usr/local/lib/python2.7/dist-packages/ipykernel_launcher.py:17: DeprecationWarning: get_clean_factor_and_forward_returns: 'by_group' argument is now deprecated and replaced by 'binning_by_group'
Dropped 0.1% entries from factor data: 0.1% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
Quantiles Statistics
                 min  max  mean  std  count  count %
factor_quantile
1                1.0  1.0   1.0  0.0   7672    100.0
[event study tear sheet plots]
Pattern  IHS  entries  17259
Get pricing for 0 entries
alphalens
Dropped 0.2% entries from factor data: 0.2% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
Quantiles Statistics
                 min  max  mean  std  count  count %
factor_quantile
1                2.0  2.0   2.0  0.0  17230    100.0
[event study tear sheet plots]