Notebook
In [1]:
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline import factors, filters, classifiers
from quantopian.pipeline.filters import  StaticAssets
from quantopian.pipeline.factors import CustomFactor, Returns, AverageDollarVolume, Latest
from quantopian.pipeline.factors import SimpleMovingAverage, RollingLinearRegressionOfReturns
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.classifiers.morningstar import Sector  
from quantopian.pipeline.filters.morningstar import Q1500US, Q500US

import math
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
# Mapping of Morningstar integer sector codes (as produced by the pipeline
# Sector() classifier) to human-readable names; -1 marks unclassified assets.
# Passed to alphalens as `groupby_labels` for sector-level plots.
MORNINGSTAR_SECTOR_CODES = {
     -1: 'Misc',
    101: 'Basic Materials',
    102: 'Consumer Cyclical',
    103: 'Financial Services',
    104: 'Real Estate',
    205: 'Consumer Defensive',
    206: 'Healthcare',
    207: 'Utilities',
    308: 'Communication Services',
    309: 'Energy',
    310: 'Industrials',
    311: 'Technology' ,    
}

Alphalens boilerplate

In [3]:
def high_volume_universe(top_liquid, min_price=None, min_volume=None):  
    """
    Build a universe of `top_liquid` liquid US equities, optionally
    screening out names that are hard to trade.

    Parameters
    ----------
    top_liquid : int
        Target universe size, ranked by 200-day average dollar volume.
    min_price : float, optional
        If given, require a 21-day average close price of at least this.
    min_volume : int, optional
        If given, require a 21-day average share volume of at least this.

    Returns
    -------
    zipline.pipeline.Filter
        The combined tradability filter.
    """
    universe = filters.make_us_equity_universe(
        target_size=top_liquid,
        rankby=factors.AverageDollarVolume(window_length=200),
        mask=filters.default_us_equity_universe_mask(),
        groupby=Sector(),
        max_group_weight=0.3,
        smoothing_func=lambda f: f.downsample('month_start'),
    )

    # Layer on optional price/volume screens, each computed only over the
    # base universe to keep the pipeline cheap.
    if min_price is not None:
        avg_price = SimpleMovingAverage(inputs=[USEquityPricing.close],
                                        window_length=21, mask=universe)
        universe = universe & (avg_price >= min_price)

    if min_volume is not None:
        avg_volume = SimpleMovingAverage(inputs=[USEquityPricing.volume],
                                         window_length=21, mask=universe)
        universe = universe & (avg_volume >= min_volume)

    return universe

       
def construct_factor_history(factor_cls, start_date='2015-10-1', end_date='2016-2-1', 
                             factor_name='factor', top_liquid=500,
                             sector_column=None):
    """
    Compute daily factor values (and optionally sector codes) for a
    liquidity-constrained universe. The returned DataFrame can be fed
    into the Alphalens factor tear sheet.
    """
    # Use the pre-built Q500US/Q1500US universes when they match exactly;
    # otherwise construct a custom liquid universe of the requested size.
    builtin_universes = {500: Q500US, 1500: Q1500US}
    if top_liquid in builtin_universes:
        universe = builtin_universes[top_liquid]()
    else:
        universe = high_volume_universe(top_liquid)

    columns = {factor_name: factor_cls(mask=universe)}
    if sector_column is not None:  # loading the sector column is very slow
        columns[sector_column] = Sector(mask=universe)

    pipe = Pipeline(columns=columns, screen=universe)
    daily_factor = run_pipeline(pipe, start_date=start_date,
                                end_date=end_date, chunksize=250)

    return daily_factor.dropna()

def get_daily_price(sid_universe, start_date, end_date, extra_days_before=0, extra_days_after=0):
    """
    Fetch daily open prices for `sid_universe`, widening the requested
    window so that `extra_days_before`/`extra_days_after` TRADING days of
    extra history are available on each side.

    Parameters
    ----------
    sid_universe : iterable
        Securities to price.
    start_date, end_date : str
        'YYYY-MM-DD' date bounds.
    extra_days_before, extra_days_after : int
        Extra trading days of padding before/after the window.

    Returns
    -------
    pd.DataFrame
        Daily open prices, dates x sids.
    """
    def _pad_calendar_days(trading_days):
        # Convert trading days to calendar days (~252 trading days per
        # 365-day year) plus a 3-day safety margin.
        return int(math.ceil(trading_days * 365.0 / 252.0)) + 3

    fmt = "%Y-%m-%d"
    start = (datetime.datetime.strptime(start_date, fmt)
             - datetime.timedelta(days=_pad_calendar_days(extra_days_before)))
    end = (datetime.datetime.strptime(end_date, fmt)
           + datetime.timedelta(days=_pad_calendar_days(extra_days_after)))

    # get_pricing is supplied by the Quantopian research environment.
    return get_pricing(sid_universe,
                       start_date=start.strftime(fmt),
                       end_date=end.strftime(fmt),
                       fields='open_price')

run_tear_sheet glues all the functions together, making it easier to run the tear sheet on a pipeline factor

In [4]:
import alphalens
import alphalens.performance as perf 
import alphalens.utils as utils

def run_tear_sheet(factor,
                   factor_name,
                   start_date,
                   end_date,
                   top_liquid,
                   show_sector_plots,
                   avgretplot,
                   periods,
                   quantiles,
                   bins,
                   filter_zscore,
                   long_short,
                   prices_cache=None):
    """
    Run the pipeline for `factor`, fetch pricing, and render the
    Alphalens tear sheet.

    Parameters
    ----------
    factor : callable(mask) -> Factor
        Factory producing the pipeline factor to analyze.
    factor_name : str
        Column name for the factor in the pipeline output.
    start_date, end_date : str
        'YYYY-MM-DD' analysis window.
    top_liquid : int
        Universe size (500/1500 use the built-in Q universes).
    show_sector_plots : bool
        Whether to load sector codes and produce by-sector plots (slow).
    avgretplot : tuple or None
        (days_before, days_after) for the event-returns plot, or None.
    periods, quantiles, bins, filter_zscore, long_short :
        Passed through to Alphalens.
    prices_cache : pd.DataFrame, optional
        Previously fetched pricing; sids already present are not re-fetched.

    Returns
    -------
    pd.DataFrame
        Pricing data — pass back in as `prices_cache` on the next call to
        save time.
    """
    sector_column = 'sector_code' if show_sector_plots else None
    days_before, days_after = (0, 0)

    if avgretplot is not None:
        days_before, days_after = avgretplot
        # Make sure pricing covers the longest forward-return period.
        days_after = max(days_after, max(periods) + 1)

    #
    ## Run the Pipeline
    #
    print('construct factor history')
    factor = construct_factor_history(factor, start_date=start_date, end_date=end_date, 
                                      factor_name=factor_name, top_liquid=top_liquid,
                                      sector_column=sector_column)
    #
    ## Get prices (only for sids not already in the cache)
    #
    sid_universe = set(factor.index.levels[1].unique())
    if prices_cache is not None:
        sid_universe -= set(prices_cache.columns)

    print('Get pricing for %d entries' % len(sid_universe))
    if sid_universe:
        prices = get_daily_price(sid_universe, start_date=start_date, end_date=end_date, 
                                 extra_days_before=days_before, extra_days_after=days_after)
        if prices_cache is not None:
            prices = pd.concat([prices, prices_cache], axis=1)
    else:
        prices = prices_cache

    #
    ## Use Alphalens to create a factor tear sheet
    #
    print('Alphalens')

    # BUG FIX: the original check was `len(np.isinf(series)) > 0`, which is
    # True for ANY non-empty series (np.isinf returns a boolean array of the
    # same length). Use .any() to detect actual +/-inf values.
    if np.isinf(factor[factor_name]).any():
        print('Dropping inf or -inf values from factor')
        factor[factor_name] = factor[factor_name].replace([np.inf, -np.inf], np.nan)

    sectors_series = factor[sector_column] if show_sector_plots else None
    factor_data = alphalens.utils.get_clean_factor_and_forward_returns(factor=factor[factor_name],
                                                                       prices=prices,
                                                                       groupby=sectors_series,
                                                                       by_group=False,
                                                                       quantiles=quantiles,
                                                                       bins=bins,
                                                                       periods=periods,
                                                                       filter_zscore=filter_zscore,
                                                                       groupby_labels=MORNINGSTAR_SECTOR_CODES)

    if avgretplot:
        alphalens.tears.create_event_returns_tear_sheet(factor_data=factor_data,
                                                        prices=prices,
                                                        avgretplot=avgretplot,
                                                        long_short=long_short,
                                                        by_group=show_sector_plots)

    alphalens.plotting.plot_quantile_statistics_table(factor_data)
    alphalens.tears.create_returns_tear_sheet(factor_data=factor_data,
                                              long_short=long_short,
                                              by_group=show_sector_plots)

    return prices

ML factor, the machine learning factor we want to test

In [5]:
#
# ML factor
#

from sklearn import linear_model, decomposition, ensemble, preprocessing, isotonic, metrics
from collections import OrderedDict
from scipy import stats

def shift_mask_data(X, Y, upper_percentile=70, lower_percentile=30, n_fwd_days=1):
    """
    Align factor data with future returns and binarize the targets.

    Shifts X forward so that factors observed at time t line up with
    returns realized n_fwd_days later, labels each (day, stock) as +1 if
    its return is at/above the day's `upper_percentile` or -1 if at/below
    the `lower_percentile`, and drops everything in between (including
    NaNs, which fail both percentile comparisons).

    Returns
    -------
    (X_flat, labels) : flattened 2-D feature matrix and matching
        +1/-1 label vector, restricted to the big movers.
    """
    offset = n_fwd_days + 1
    # Roll rows down by `offset`, then discard the wrapped-around rows so
    # X[i] holds factors from `offset` days before Y[i].
    X = np.roll(X, offset, axis=0)[offset:]
    Y = Y[offset:]

    n_time, n_stocks, n_factors = X.shape

    # Per-day return cutoffs for the biggest up and down movers.
    upper_cut = np.nanpercentile(Y, upper_percentile, axis=1)[:, np.newaxis]
    lower_cut = np.nanpercentile(Y, lower_percentile, axis=1)[:, np.newaxis]

    moved_up = Y >= upper_cut
    moved_down = Y <= lower_cut
    keep = (moved_up | moved_down).flatten()  # NaN rows compare False, so they drop out

    # Relative-movement labels: +1 top movers, -1 bottom movers, 0 otherwise.
    labels = np.zeros(n_time * n_stocks)
    labels[moved_up.flatten()] = 1
    labels[moved_down.flatten()] = -1

    X_flat = X.reshape((n_time * n_stocks, n_factors))

    # Keep only the decisive movers (outside the 30th-70th percentile band).
    return X_flat[keep], labels[keep]

def get_last_values(input_data):
    """
    Extract the most recent (last) row from each 2-D dataset and stack
    them so the result is shaped (stocks, datasets) — i.e. one column of
    latest values per input factor.
    """
    return np.vstack([dataset[-1] for dataset in input_data]).T

# Definition of Machine Learning factor which trains a model and predicts forward returns
class ML(CustomFactor):
    # Pipeline factor that trains a classifier on ranked alpha factors vs.
    # subsequent returns and emits a per-stock prediction.
    # `init` records whether the model has been fit at least once; it is
    # only consulted by the commented-out weekly-retrain condition below
    # (the active `if True:` retrains every day).
    init = False
    params = ('n_fwd_days',)
    
    def compute(self, today, assets, out, returns, *inputs, **params):
        """
        Fit the classifier on the trailing window and write the (negated)
        probability of each asset moving up into `out`.

        `returns` is the first pipeline input (time x stocks); remaining
        `inputs` are the ranked alpha factors; `n_fwd_days` is the forward
        horizon used to align factors with future returns.
        """
        n_fwd_days = params['n_fwd_days']
        
        # inputs is a list of factors, for example, assume we have 2 alpha signals, 3 stocks,
        # and a lookback of 2 days. Each element in the inputs list will be data of
        # one signal, so len(inputs) == 2. Then each element will contain a 2-D array
        # of shape [time x stocks]. For example:
        # inputs[0]:
        # [[1, 3, 2], # factor 1 rankings of day t-1 for 3 stocks  
        #  [3, 2, 1]] # factor 1 rankings of day t for 3 stocks
        # inputs[1]:
        # [[2, 3, 1], # factor 2 rankings of day t-1 for 3 stocks
        #  [1, 2, 3]] # factor 2 rankings of day t for 3 stocks

        #if (not self.init) or (today.weekday() == 0): # Monday
        if True:                                       # every day
            # Instantiate fresh sklearn objects each (re)train.
            # NOTE(review): preprocessing.Imputer is removed in modern
            # sklearn (SimpleImputer replaces it) — fine on this legacy stack.
            self.imputer = preprocessing.Imputer()
            self.scaler = preprocessing.MinMaxScaler()
            self.clf = ensemble.AdaBoostClassifier(n_estimators=50)
            #self.clf = ensemble.RandomForestClassifier()
            
            # Stack factor rankings
            X = np.dstack(inputs) # (time, stocks, factors)
            Y = returns # (time, stocks)
        
            # Shift data to match with future returns and binarize 
            # returns based on their 
            X, Y = shift_mask_data(X, Y, n_fwd_days=n_fwd_days)
            
            # Impute NaNs, then scale features to [0, 1] before fitting.
            X = self.imputer.fit_transform(X)            
            X = self.scaler.fit_transform(X)
            
            # Fit the classifier
            self.clf.fit(X, Y)
            #log.debug(self.clf.feature_importances_)
            self.init = True

        # Predict
        # Get most recent factor values (inputs always has the full history)
        last_factor_values = get_last_values(inputs)
        last_factor_values = self.imputer.transform(last_factor_values)
        last_factor_values = self.scaler.transform(last_factor_values)

        # Predict the probability for each stock going up 
        # (column 2 of the output of .predict_proba()) and
        # return it via assignment to out.
        
        #ajjc: Changed to "minus" to adjust for right direction of cumulative returns.
        out[:] = -self.clf.predict_proba(last_factor_values)[:, 1]
         
def make_ml_factor(mask, factors, window_length, n_fwd_days):
    """
    Assemble the ML pipeline factor: trailing returns (the training
    target) plus the cross-sectionally ranked alpha factors are fed into
    the ML CustomFactor, which trains a model and predicts forward returns.
    """
    ml_inputs = OrderedDict()
    # Returns over the last n_fwd_days serve as the (future) target once
    # shifted inside ML.compute().
    ml_inputs['Returns'] = Returns(inputs=[USEquityPricing.open],
                                   mask=mask, window_length=n_fwd_days + 1)

    # Rank each alpha factor within the universe so the classifier sees
    # comparable cross-sectional inputs.
    for name, factor in factors.items():
        ml_inputs[name] = factor(mask=mask).rank(mask=mask)

    # window_length controls how much lookback ML.compute() receives.
    return ML(inputs=ml_inputs.values(),
              window_length=window_length + 1,
              mask=mask,
              n_fwd_days=n_fwd_days)

Define settings

In [6]:
# Column name for the factor in the pipeline output / tear sheet.
factor_name = 'factor'

#ajjc: Date change
start_date  = '2015-10-01'
end_date    = '2017-10-01'

#start_date  = '2010-06-01'
#end_date    = '2014-06-01'

# Universe size; 500/1500 would use the built-in Q universes.
top_liquid  =  600 #600
show_sector_plots = False # very slow to load the sector column in pipeline

#ajjc: large forward look-ahead. Training size will be (WND_L-FWD_N) x num_valid_assets.
FWD_N = 140  # forward horizon (trading days) the model predicts over
WND_L = 252  # lookback window passed to the ML factor

# alphalens specific
#periods = (1,3, 6, 10)
# Forward-return periods bracket the model horizon (0.8x, 1x, 1.2x FWD_N).
periods = (1, 5, 10, int(0.8*FWD_N), FWD_N, int(1.2*FWD_N))
quantiles = None  # use `bins` instead of quantile buckets
bins      = 5
avgretplot  = None # (1, 5)
filter_zscore = None
long_short  = True

prices_cache = None # this saves lots of time when running tear sheet multiple times

Performance of a very predictive factor

In [7]:
from quantopian.pipeline.data.alpha_vertex import precog_top_100, precog_top_500

def precog(mask):
    """Latest Alpha Vertex PreCog 500 predicted five-day log return."""
    return Latest(inputs=[precog_top_500.predicted_five_day_log_return], mask=mask)
In [8]:
# Run the tear sheet on the PreCog factor; keep the returned pricing as a
# cache so repeated runs skip re-fetching prices.
tear_sheet_settings = dict(
    factor=precog,
    factor_name=factor_name,
    start_date=start_date,
    end_date=end_date,
    top_liquid=top_liquid,
    show_sector_plots=show_sector_plots,
    avgretplot=avgretplot,
    periods=periods,
    quantiles=quantiles,
    bins=bins,
    filter_zscore=filter_zscore,
    long_short=long_short,
    prices_cache=prices_cache,
)
prices_cache = run_tear_sheet(**tear_sheet_settings)
construct factor history
Get pricing for 768 entries
Alphalens
Dropping inf or -inf values from factor
Quantiles Statistics
min max mean std count count %
factor_quantile
1 -1.135 0.151 -0.027381 0.063370 6739 4.379074
2 -0.613 0.385 -0.010695 0.024747 44049 28.623506
3 -0.484 0.382 0.003195 0.023741 71043 46.164493
4 -0.187 0.837 0.009106 0.033271 28353 18.424079
5 -0.039 1.308 0.051953 0.085573 3707 2.408848
Returns Analysis
1 5 10 112 140 168
Ann. alpha 0.176 0.049 0.001 0.015 0.015 0.028
beta -0.012 0.003 0.035 0.005 -0.003 -0.092
Mean Period Wise Return Top Quantile (bps) 2.677 5.877 19.436 350.858 318.217 300.195
Mean Period Wise Return Bottom Quantile (bps) -5.526 0.695 11.487 27.001 -9.036 -33.608
Mean Period Wise Spread (bps) 17.859 2.342 -0.712 4.753 2.922 2.618
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:727: FutureWarning: pd.rolling_apply is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,min_periods=1,window=5).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:767: FutureWarning: pd.rolling_apply is deprecated for DataFrame and will be removed in a future version, replace with 
	DataFrame.rolling(center=False,min_periods=1,window=5).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:727: FutureWarning: pd.rolling_apply is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,min_periods=1,window=10).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:767: FutureWarning: pd.rolling_apply is deprecated for DataFrame and will be removed in a future version, replace with 
	DataFrame.rolling(center=False,min_periods=1,window=10).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:727: FutureWarning: pd.rolling_apply is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,min_periods=1,window=112).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:767: FutureWarning: pd.rolling_apply is deprecated for DataFrame and will be removed in a future version, replace with 
	DataFrame.rolling(center=False,min_periods=1,window=112).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:727: FutureWarning: pd.rolling_apply is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,min_periods=1,window=140).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:767: FutureWarning: pd.rolling_apply is deprecated for DataFrame and will be removed in a future version, replace with 
	DataFrame.rolling(center=False,min_periods=1,window=140).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:727: FutureWarning: pd.rolling_apply is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,min_periods=1,window=168).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:767: FutureWarning: pd.rolling_apply is deprecated for DataFrame and will be removed in a future version, replace with 
	DataFrame.rolling(center=False,min_periods=1,window=168).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:519: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=22,center=False).mean()
  pd.rolling_mean(mean_returns_spread_bps, 22).plot(color='orangered',
<matplotlib.figure.Figure at 0x7fe2ce303190>