Notebook
In [13]:
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.factors import SimpleMovingAverage
from quantopian.pipeline.classifiers.fundamentals import Sector 
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data import Fundamentals
from quantopian.pipeline.filters import Q1500US, Q500US
import pandas as pd
import numpy as np
import random as random
from itertools import combinations
from sklearn.cluster import KMeans
def make_pipeline(): 
  
  sector_filter = Sector()
  financial_sector_filter = sector_filter.eq(103)

  market_cap = Fundamentals.market_cap.latest
  
  enterprise_value = Fundamentals.enterprise_value.latest
  
  dps_growth = Fundamentals.dps_growth.latest 

  sustain_growth = Fundamentals.sustainable_growth_rate.latest
  
  working_capital_per_share = Fundamentals.working_capital_per_share.latest 
    
  ROA = Fundamentals.roa.latest
  
  ROE = Fundamentals.roe.latest

  ROIC = Fundamentals.roic.latest

  EV_EBITDA = Fundamentals.ev_to_ebitda.latest

  return Pipeline(
      columns={
          #'EV/EBITDA': EV_EBITDA,
          'enterprise value': enterprise_value,
          'market_cap': market_cap,
          'sustain growth': sustain_growth,
          'ROA' : ROA,
          'ROE' : ROE,
          'ROIC' : ROIC
    }, screen = financial_sector_filter
      
  )
result = run_pipeline(make_pipeline(), '2015-05-05', '2015-05-05')

result = result.dropna(axis=0)
result.head(5)
Out[13]:
ROA ROE ROIC enterprise value market_cap sustain growth
2015-05-05 00:00:00+00:00 Equity(21 [AAME]) 0.003682 0.011316 0.010876 9.892274e+07 8.150474e+07 0.0346
Equity(66 [AB]) 0.028198 0.028258 0.028258 3.247275e+09 3.247275e+09 0.0038
Equity(157 [AEG]) -0.000153 -0.002354 0.000615 2.605216e+10 2.124709e+10 0.0005
Equity(185 [AFL]) 0.005498 0.035853 0.029553 2.930033e+10 2.756033e+10 0.1282
Equity(192 [ATAX]) 0.001303 0.003155 0.007914 5.693063e+08 3.344038e+08 -0.0613
In [14]:
result_array = result.values #change from DataFrame to array to use k-means library 
In [15]:
result_array
Out[15]:
array([[  3.68200000e-03,   1.13160000e-02,   1.08760000e-02,
          9.89227430e+07,   8.15047430e+07,   3.46000000e-02],
       [  2.81980000e-02,   2.82580000e-02,   2.82580000e-02,
          3.24727510e+09,   3.24727510e+09,   3.80000000e-03],
       [ -1.53000000e-04,  -2.35400000e-03,   6.15000000e-04,
          2.60521645e+10,   2.12470910e+10,   5.00000000e-04],
       ..., 
       [  1.49100000e-03,   1.01940000e-02,   9.11800000e-03,
          2.16061994e+10,   1.43571994e+10,   4.21000000e-02],
       [  3.31100000e-03,   4.13979000e-01,   1.10787000e-01,
          6.58138244e+08,   4.31409244e+08,   1.50300000e-01],
       [  8.12000000e-04,   1.68980000e-02,   1.49610000e-02,
         -2.92384211e+10,   7.21025955e+10,   4.96000000e-02]])
In [16]:
kmeans = KMeans(n_clusters=50).fit(result_array) #fit into 50 clusters
cluster_label = kmeans.labels_
print cluster_label
[14  0 37 37 14 27 26 40  4 14  0 14 27  3 13  8 49 10 26 14  0 14 30 40  0
 40  0  0 14 14 40 37  0 40  3  0  0 14 40 26 26 47 14 40 14 40 14 14 14 40
  0  0 14 14 14 14 14  0 14 40 40 40 14 40 44 14 40 40  0 14 40 40 14 14 12
 40  0 14 14 14 40 14 40 14 14 25 14 47 26 47 44 48 14 41 14 49  4  0 14  0
 14 44 40 14 40 14 14 45 14 40  0 40  0 14 47 40 38 14 14 40 26 40 12 31 26
  0 40 40 14 26  0  0  4 14 40 10 29 40  4 14  0  0 40 14 26 40 44 14 26  0
 14 14 40 26  0 40 14 32  0  6  0 40 14 26 40 14  4 14 14 14 14 14 14 14 26
 14 14 14  0 14 14 40 14  0  0 40 14 14 41 14 14 40 41 14 14 40 14 14 14 14
 14 14 14 10 14 14  9 40  0 14 14 14 14  0 12  0 14  0 26  0 26 43 40  0 14
 44 14 14 40 14 14 14 14 40 14 14 40 14 14 14  2 14 14 14 14 40 14 14 14 44
 44 40 40 25 14 14  1 20  0  0 14 14 14 17 14 14 14 14 14 14 14 14 40 14 14
 14  0 40 14  0 14  0  0 14 40 14 14 14 14 14 14 14  0 14 14 14 14 14 40  0
 14 14  0  0 14 43 14 14  0  0  0 14 14  5 41 14 11 14 40 40 14 40 14 14 14
 14 14 14 14 19 44 11 14 40 14 40 14  0 14 14 11 14 40 14 14 40 14 12 14 14
 14 14 14 14 14 14 23 14 14 14 14 14 14 40 26 14 31 14 14 14 35 14  9 47 36
  0 14 14 14 14 14 46 19 14 12 20 40 14 14 40 14 14 14 40 14 14 14 40 14 14
 17 14 41  0 40 14 40 40 40  4 14 21 36 40  0 14 14 14  0 25 14 14 14 40 40
 39 42 14 14  0 40 14 14 14 38  0  0  0 14 40  0 14 14 14 14 14  0 40 14 26
 14 14 14 14 14 14  0 14  0 14 14 14 14 14 14 26 14 14 40 14 14 14 26 14 40
 40 14 14 41 14 14 14 37 14 14 14  0 37 14 14 40 14 14 14 14 14 14 14 14 14
 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 40 14
 24 14 40 14  0 14 14 40 14 47 14 14 14 40 14 34 40  0 14 14 14 40 14  0 40
 14 14 40  0 45 14  4 31 40 14  0 40 14 28 14 47 14 14 14 40 14 14 14 18 14
 40 14 14 40 14 33 14 14 40 14 19 14 12 14  0 14 40 14 40 40 14  0  0  4 14
 14 14 14 16 40  0 14 12 14 14 14 14  0 14 45 14 40 14 14 14 14 14 14 14 14
 14 38 40 22  0 45 14 14 47 14 14 14 14 14 14 14 14  7 12 14 40 40 40 14 14
 14 14 14 14 14 40 14 33 14 40 40  0 15 14 14 14 14 14 47  0 14 12 44 14  2]
In [17]:
cluster = np.array(cluster_label)
cluster = cluster.reshape((-1, 1))
#result_array = np.append(cluster,result_array,axis=1) #append cluster ID to array 
#pd.DataFrame(result_array) #change array back to DataFrame
In [18]:
result['Cluster'] = cluster
result.head()
Out[18]:
ROA ROE ROIC enterprise value market_cap sustain growth Cluster
2015-05-05 00:00:00+00:00 Equity(21 [AAME]) 0.003682 0.011316 0.010876 9.892274e+07 8.150474e+07 0.0346 14
Equity(66 [AB]) 0.028198 0.028258 0.028258 3.247275e+09 3.247275e+09 0.0038 0
Equity(157 [AEG]) -0.000153 -0.002354 0.000615 2.605216e+10 2.124709e+10 0.0005 37
Equity(185 [AFL]) 0.005498 0.035853 0.029553 2.930033e+10 2.756033e+10 0.1282 37
Equity(192 [ATAX]) 0.001303 0.003155 0.007914 5.693063e+08 3.344038e+08 -0.0613 14