Notebook

Alphalens boilerplate

In [ ]:
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline import factors, filters, classifiers
from quantopian.pipeline.classifiers import Classifier
from quantopian.pipeline.factors import CustomFactor, Returns, AverageDollarVolume, SimpleMovingAverage, Latest
from quantopian.pipeline.filters import StaticAssets, Q500US, Q1500US, Q3000US, QTradableStocksUS
from quantopian.pipeline.filters.fundamentals import IsPrimaryShare
from quantopian.pipeline.classifiers.fundamentals import Sector  
from quantopian.pipeline.data.builtin import USEquityPricing

import alphalens
import math
import datetime
import numpy as np
import pandas as pd

## Helper functions

def high_volume_universe(top_liquid, min_price = None, min_volume = None):  
    """
    Computes a universe of liquid stocks, filtering out
    hard-to-trade names.
    Returns
    -------
    high_volume_tradable : zipline.pipeline.Filter
    """
    
    if top_liquid == 'QTradableStocksUS':
        universe = QTradableStocksUS()
    elif top_liquid == 500:
        universe = Q500US()
    elif top_liquid == 1500:
        universe = Q1500US()
    elif top_liquid == 3000:
        universe = Q3000US()        
    else:        
        universe = filters.make_us_equity_universe(
            target_size=top_liquid,
            rankby=factors.AverageDollarVolume(window_length=200),
            mask=filters.default_us_equity_universe_mask(),
            groupby=Sector(),
            max_group_weight=0.3,
            smoothing_func=lambda f: f.downsample('month_start'),
        )
    
    if min_price is not None:
        price = SimpleMovingAverage(inputs=[USEquityPricing.close],
                                    window_length=21, mask=universe)
        universe &= (price >= min_price)
        
    if min_volume is not None:
        volume = SimpleMovingAverage(inputs=[USEquityPricing.volume],
                                     window_length=21, mask=universe)
        universe &= (volume >= min_volume)
        
    return universe
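
# Hypothetical usage: the ~500 most liquid names, trading above $5 with at
# least 100k shares of 21-day average daily volume:
#   universe = high_volume_universe(top_liquid=500, min_price=5.0, min_volume=1e5)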
      
def construct_factor_history(factor_cls, start_date='2015-10-1', end_date='2016-2-1', 
                             factor_name='factor', top_liquid=500,
                             sector_column=None):
    """
    Creates a DataFrame containing daily factor values and sector codes for a
    liquidity-constrained universe. The returned DataFrame can be used in the
    factor tear sheet.
    """
    ok_universe = high_volume_universe(top_liquid)
       
    factor = factor_cls(mask=ok_universe)
    sector = Sector(mask=ok_universe)    
       
    pipe = Pipeline()
    pipe.add(factor, factor_name)
    if sector_column is not None: # loading the sector column in pipeline is very slow
        pipe.add(sector, sector_column)  
    pipe.set_screen(ok_universe)

    daily_factor = run_pipeline(pipe, start_date=start_date, end_date=end_date, chunksize=512)
       
    return daily_factor.dropna()
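
# Note: run_pipeline returns a DataFrame indexed by a (date, asset)
# MultiIndex with one column per pipeline term, which is the shape
# Alphalens expects for the factor input.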

def get_daily_price(sid_universe, start_date, end_date, extra_days_before=0, extra_days_after=0):
    """
    Creates a DataFrame containing daily open prices, with the requested date
    range padded so that returns can be computed at the boundaries
    """   
    extra_days = math.ceil(extra_days_before * 365.0/252.0) + 3 # just to be sure
    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d") - datetime.timedelta(days=extra_days)
    start_date = start_date.strftime("%Y-%m-%d")
    
    extra_days = math.ceil(extra_days_after * 365.0/252.0) + 3 # just to be sure
    end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=extra_days)
    end_date = end_date.strftime("%Y-%m-%d")
    
    pricing = get_pricing(sid_universe, start_date=start_date, end_date=end_date, fields='open_price')
    
    return pricing
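
# Note: extra_days_before/after are trading days; the 365/252 scaling above
# (plus a 3-day buffer) converts them to calendar days. For example, 5 trading
# days -> ceil(5 * 365.0/252.0) + 3 = 11 calendar days of padding.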

#
# 'run_tear_sheet' glues all the functions together, making it easy to run
# the tear sheet on a pipeline factor
#

def run_tear_sheet(factor,
                   factor_name,
                   start_date,
                   end_date,
                   top_liquid,
                   show_sector_plots,
                   avgretplot,
                   periods,
                   quantiles,
                   bins,
                   filter_zscore,
                   long_short,
                   prices_cache = None):
     
    sector_column = 'sector_code' if show_sector_plots else None
    days_before, days_after = (0,0)

    if avgretplot is not None:   
        days_before, days_after = avgretplot
        days_after = max(days_after, max(periods) + 1)
    
    #
    ## Run the Pipeline
    #
    print 'Construct factor history'
    factor = construct_factor_history(factor, start_date=start_date, end_date=end_date, 
                                      factor_name=factor_name, top_liquid=top_liquid,
                                      sector_column=sector_column)
    #
    ## Get prices
    #
    sid_universe = set( factor.index.levels[1].unique() )
    if prices_cache is not None:
        cached_sids = set(prices_cache.columns)
        sid_universe -= cached_sids
        
    print 'Get pricing for %d entries' % len(sid_universe)
    if sid_universe:
        prices = get_daily_price(sid_universe, start_date=start_date, end_date=end_date, 
                                 extra_days_before=days_before, extra_days_after=days_after)
        if prices_cache is not None:
            prices = pd.concat([prices, prices_cache], axis=1)
    else:
        prices = prices_cache

    #
    ## Use Alphalens to create a factor tear sheet
    #
    print 'Alphalens'
    
    if np.isinf(factor[factor_name]).any():
        print 'Dropping inf or -inf values from factor'
        factor[factor_name] = factor[factor_name].replace([np.inf, -np.inf], np.nan)
    
    sectors_series = factor[sector_column] if show_sector_plots else None
    factor_data = alphalens.utils.get_clean_factor_and_forward_returns(factor=factor[factor_name],
                                                                       prices=prices,
                                                                       groupby=sectors_series,
                                                                       by_group=False,
                                                                       quantiles=quantiles,
                                                                       bins=bins,
                                                                       periods=periods,
                                                                       filter_zscore=filter_zscore,
                                                                       groupby_labels=Sector.SECTOR_NAMES)

    if avgretplot:
        alphalens.tears.create_event_returns_tear_sheet(factor_data=factor_data,
                                                        prices=prices,
                                                        avgretplot=avgretplot,
                                                        long_short=long_short,
                                                        by_group=show_sector_plots)

    #alphalens.tears.create_full_tear_sheet(factor_data=factor_data,
    #                                       long_short=long_short,
    #                                       group_adjust=False,
    #                                       by_group=show_sector_plots)
    alphalens.plotting.plot_quantile_statistics_table(factor_data)
    alphalens.tears.create_returns_tear_sheet(factor_data=factor_data,
                                              long_short=long_short,
                                              by_group=show_sector_plots)

    
    return prices

ML factor: the machine-learning factor we want to test

In [ ]:
#
# ML factor
#

from sklearn import linear_model, decomposition, ensemble, preprocessing, isotonic, metrics
from collections import OrderedDict
from scipy import stats

def shift_mask_data(X, Y, upper_percentile=70, lower_percentile=30, n_fwd_days=1):
    # Shift X so that factors at time t line up with returns at t + n_fwd_days
    # (we want to predict future returns, after all)
    shifted_X = np.roll(X, n_fwd_days+1, axis=0)

    # Slice off rolled elements
    X = shifted_X[n_fwd_days+1:]
    Y = Y[n_fwd_days+1:]
    
    n_time, n_stocks, n_factors = X.shape
    
    # Look for biggest up and down movers
    upper = np.nanpercentile(Y, upper_percentile, axis=1)[:, np.newaxis]
    lower = np.nanpercentile(Y, lower_percentile, axis=1)[:, np.newaxis]
  
    upper_mask = (Y >= upper)
    lower_mask = (Y <= lower)
    
    mask = upper_mask | lower_mask # This also drops nans
    mask = mask.flatten()
    
    # Only try to predict whether a stock moved up/down relative to other stocks
    Y_binary = np.zeros(n_time * n_stocks)
    Y_binary[upper_mask.flatten()] = 1
    Y_binary[lower_mask.flatten()] = -1
    
    # Flatten X
    X = X.reshape((n_time * n_stocks, n_factors))

    # Drop stocks that did not move much (i.e. those between the lower and
    # upper percentiles)
    X = X[mask]
    Y_binary = Y_binary[mask]
    
    return X, Y_binary
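
# Hypothetical sanity check on synthetic data (outside the pipeline):
#   X = np.random.randn(10, 5, 3)   # 10 days, 5 stocks, 3 factors
#   Y = np.random.randn(10, 5)      # daily returns
#   X_b, Y_b = shift_mask_data(X, Y, n_fwd_days=1)
#   assert set(np.unique(Y_b)) <= {-1.0, 1.0}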

def get_last_values(input_data):
    last_values = []
    for dataset in input_data:
        last_values.append(dataset[-1])
    return np.vstack(last_values).T
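
# For example, with len(input_data) == 2 factors and 3 stocks, each
# dataset[-1] has shape (3,), so get_last_values returns shape (3, 2):
# one row per stock, one column per factor.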

# Definition of Machine Learning factor which trains a model and predicts forward returns
class ML(CustomFactor):
    init = False
    params = ('n_fwd_days',)
    
    def compute(self, today, assets, out, returns, *inputs, **params):
        
        n_fwd_days = params['n_fwd_days']
        
        # inputs is a list of factors, for example, assume we have 2 alpha signals, 3 stocks,
        # and a lookback of 2 days. Each element in the inputs list will be data of
        # one signal, so len(inputs) == 2. Then each element will contain a 2-D array
        # of shape [time x stocks]. For example:
        # inputs[0]:
        # [[1, 3, 2], # factor 1 rankings of day t-1 for 3 stocks  
        #  [3, 2, 1]] # factor 1 rankings of day t for 3 stocks
        # inputs[1]:
        # [[2, 3, 1], # factor 2 rankings of day t-1 for 3 stocks
        #  [1, 2, 3]] # factor 2 rankings of day t for 3 stocks

        #if (not self.init) or (today.weekday() == 0): # Monday
        if True:                                       # every day
            # Instantiate sklearn objects
            self.imputer = preprocessing.Imputer()
            self.scaler = preprocessing.MinMaxScaler()
            self.clf = ensemble.AdaBoostClassifier(n_estimators=50)
            #self.clf = ensemble.RandomForestClassifier()
            
            # Stack factor rankings
            X = np.dstack(inputs) # (time, stocks, factors)
            Y = returns # (time, stocks)
        
            # Shift data to align factors with future returns and binarize
            # returns based on the upper/lower percentiles
            X, Y = shift_mask_data(X, Y, n_fwd_days=n_fwd_days)
            
            X = self.imputer.fit_transform(X)            
            X = self.scaler.fit_transform(X)
            
            # Fit the classifier
            self.clf.fit(X, Y)
            #log.debug(self.clf.feature_importances_)
            self.init = True

        # Predict
        # Get most recent factor values (inputs always has the full history)
        last_factor_values = get_last_values(inputs)
        last_factor_values = self.imputer.transform(last_factor_values)
        last_factor_values = self.scaler.transform(last_factor_values)

        # Predict the probability of each stock going up
        # (second column of the output of .predict_proba()) and
        # return it via assignment to out.
        out[:] = self.clf.predict_proba(last_factor_values)[:, 1]
         
def make_ml_factor(mask, factors, window_length, n_fwd_days):
    factors_pipe = OrderedDict()
    # Create returns over last n days.
    factors_pipe['Returns'] = Returns(inputs=[USEquityPricing.open],
                                      mask=mask, window_length=n_fwd_days + 1)
    
    # Instantiate ranked factors
    for name, f in factors.iteritems():
        factors_pipe[name] = f(mask=mask).rank(mask=mask)
        
    # Create our ML pipeline factor. The window_length controls how much
    # lookback the passed-in data will have.
    ml = ML(inputs=factors_pipe.values(),
            window_length=window_length + 1,
            mask=mask,
            n_fwd_days=n_fwd_days)
    return ml
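
# Hypothetical usage (any pipeline factor accepting a mask works here):
#   universe = high_volume_universe(top_liquid=500)
#   alpha_factors = OrderedDict([
#       ('adv20', lambda mask: AverageDollarVolume(window_length=20, mask=mask)),
#       ('ret5',  lambda mask: Returns(window_length=5, mask=mask)),
#   ])
#   ml = make_ml_factor(mask=universe, factors=alpha_factors,
#                       window_length=60, n_fwd_days=5)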

Define settings

In [7]:
factor_name = 'factor'

start_date  = '2010-06-01'
end_date    = '2014-06-01'

top_liquid  =  600
show_sector_plots = False # very slow to load the sector column in pipeline

# alphalens specific
periods = (1, 3)
quantiles = None
bins      = 5
avgretplot  = None # (1, 5)
filter_zscore = None
long_short  = True

prices_cache = None # this saves lots of time when running tear sheet multiple times

Performance of a very predictive factor

In [4]:
from quantopian.pipeline.data.alpha_vertex import precog_top_100, precog_top_500

def precog(mask):
    return Latest(inputs=[precog_top_500.predicted_five_day_log_return], mask=mask)
In [5]:
prices_cache = \
run_tear_sheet( factor       = precog,
                factor_name  = factor_name,
                start_date   = start_date,
                end_date     = end_date,
                top_liquid   = top_liquid,
                show_sector_plots = show_sector_plots,
                avgretplot   = avgretplot,               
                periods      = periods,
                quantiles    = quantiles,
                bins         = bins,
                filter_zscore = filter_zscore,
                long_short   = long_short,
                prices_cache = prices_cache)
Get pricing for 889 entries
Alphalens
Dropping inf or -inf values from factor
Quantiles Statistics

                    min    max      mean       std  count    count %
factor_quantile
1                -1.602  0.033 -0.034243  0.036969  92261  20.781470
2                -0.140  0.051 -0.009856  0.017530  90499  20.384586
3                -0.069  0.071  0.001158  0.016782  88336  19.897378
4                -0.050  0.106  0.012298  0.018441  86986  19.593295
5                -0.034  1.258  0.037551  0.038393  85876  19.343271
Returns Analysis

                                                    1        3
Ann. alpha                                      0.154    0.096
beta                                            0.003    0.004
Mean Period Wise Return Top Quantile (bps)      6.071   11.251
Mean Period Wise Return Bottom Quantile (bps)  -7.196  -12.960
Mean Period Wise Spread (bps)                  13.341    8.151
[returns tear sheet plots]