 
              In [1]: import numpy as np import pandas as pd import os import warnings import time warnings.simplefilter("ignore") In [2]: from sklearn.svm import SVR from sklearn.linear_model import SGDRegressor, LinearRegression from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor, GradientBoostingReg ressor from sklearn.metrics import auc_score from sklearn.preprocessing import OneHotEncoder Cause-Effect Pairs http://www.kaggle.com/c/cause-effect-pairs/ Goals: Introduction to the challenge Philosophy behind my solution Rough overview of my solution Possible improvements Understanding The Problem
"Given samples from a pair of variables A, B, find whether A is a cause of B." The objective of the challenge is to rank pairs of variables {A, B} to prioritize experimental verifications of the conjecture that A causes B. This challenge is limited to pairs of variables deprived of their context. For the training data, we are given whether A causes B B cases A A and B are consequences of a common cause, or A and B are indepednent. Error Metric: Bi-directional AUC -> It's a ranking problem. Simplified: Is it more likely that y = f(x, noise) or x = f(y, noise)? In [3]: def bidirectional_auc(true, predictions): return (auc_score(true == 1, predictions) + auc_score(true == -1, - predictions)) / 2 In [4]: from IPython.core.display import Image Image('quiz.png') Out[4]:
Understanding The Data In [5]: folder = "SUP2data_text" # Using SUP2 because SUP1 and SUP3 have no ca tegorical data!!! pairs = pd.read_csv(os.path.join(folder, "CEdata_train_pairs.csv")) target = pd.read_csv(os.path.join(folder, "CEdata_train_target.csv")) publicinfo = pd.read_csv(os.path.join(folder, "CEdata_train_publicinfo. csv")) In [6]: # utility functions! def to_array(pair_str): return np.array(map(int, pair_str.split())) def get_pair(idx, df): return np.array(df['A'])[idx], np.array(df['B'])[idx] def plot_pair(idx, subset=(pairs, target)): df, target = subset x, y = map(to_array, get_pair(idx, df)) print (target.iloc[idx]) plot(x, y, 'o') xlabel('A') ylabel('B') pylab.show() return x, y def type_finder(a_type, b_type): idxs = (a_type == publicinfo['A type']) * (b_type == publicinfo['B type']) > 0 return pairs[idxs], target[idxs]
def with_fit(clf, x, y): x_test = np.linspace(x.min(), x.max(), 1000).reshape(-1, 1) clf.fit(x.reshape(-1, 1), y.flatten()) predictions = clf.predict(x_test) return x_test, predictions def fancy_plot(clf, x, y): plot(x, y, 'o') xlabel('A') ylabel('B') X_to_Y = with_fit(clf, x, y) Y_to_X = with_fit(clf, y, x) p1, = plot(*X_to_Y) p2, = plot(*reversed(Y_to_X)) legend([p1, p2], ["X->Y", "Y->X"]) class PolynomialLinearRegression (object): def __init__(self, degree): self.degree = degree def fit(self, X, y): self.clf = LinearRegression() new_X = np.hstack([X ** n for n in range(1, self.degree + 1)]) self.clf.fit(new_X, y) return self def predict(self, X): new_X = np.hstack([X ** n for n in range(1, self.degree + 1)]) return self.clf.predict(new_X) numerical, categorical, binary = "Numerical", "Categorical", "Binary" types = (numerical, categorical, binary) In [7]: warnings.simplefilter("ignore") pairs.head() Out[7]: SampleID A B 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 -3374 1389 -20322 -12206 0 train1 0 1 1 1 0 1 1 1... -9211 11501 6180 -71... 10105 -3574 -6717 6152 -12820 122 -1222 -4496 1 train2 -6488 1383 6710 8727 1... -3502 -1733 -2409 -363... 9493 -12079 25841 2191 -3421 -9969 -6761 -5880 2 train3 -7618 4012 -1875 -1526... 5276 -5847 -4636 -915... -1679 13870 -9157 -1269 3 2 2 3 2 2 3 2 2 2 2 3 2 3 3 3 train4 -8850 13711 -1438 -92... 3 3 2 3 2 2 3 2... 7810 -5188 -10806 11361 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 4 train5 2020 7001 5679 4429 -... 1 1 0 0 0 0 0 0...
4 train5 2020 7001 5679 4429 -... 1 1 0 0 0 0 0 0... In [8]: target.head() Out[8]: SampleID Target [1 for A->B; Details [1 for A->B; 2 for A<- -1 otherwise] B; 3 for A-B; 4 for A|B] 0 train1 -1 2 1 train2 -1 2 2 train3 0 4 3 train4 -1 2 4 train5 -1 2 In [9]: publicinfo.head() Out[9]: SampleID A type B type 0 train1 Binary Numerical 1 train2 Numerical Numerical 2 train3 Numerical Numerical 3 train4 Numerical Categorical 4 train5 Numerical Binary In [10]: publicinfo['A type'].value_counts() Out[10]: Numerical 4511 Categorical 743 Binary 735 dtype: int64 In [11]: publicinfo['B type'].value_counts() Out[11]: Numerical 4480 Binary 784 Categorical 725 dtype: int64 In [12]: CC = type_finder(categorical, categorical) NC = type_finder(numerical, categorical)
CN = type_finder(categorical, numerical) NN = type_finder(numerical, numerical) In [13]: type_matrix = np.array([[len(type_finder(t1, t2)[0]) for t1 in types] f or t2 in types]) pd.DataFrame(type_matrix, index=types, columns=types) Out[13]: Numerical Categorical Binary Numerical 3372 558 550 Categorical 550 92 83 Binary 589 93 102 In [14]: for i in range(6): plot_pair(i, NN) SampleID train2 Target [1 for A->B; -1 otherwise] -1 Details [1 for A->B; 2 for A<-B; 3 for A-B; 4 for A|B] 2 Name: 1, dtype: object SampleID train3 Target [1 for A->B; -1 otherwise] 0 Details [1 for A->B; 2 for A<-B; 3 for A-B; 4 for A|B] 4 Name: 2, dtype: object
SampleID train6 Target [1 for A->B; -1 otherwise] 1 Details [1 for A->B; 2 for A<-B; 3 for A-B; 4 for A|B] 1 Name: 5, dtype: object SampleID train7 Target [1 for A->B; -1 otherwise] 0 Details [1 for A->B; 2 for A<-B; 3 for A-B; 4 for A|B] 4 Name: 6, dtype: object SampleID train9
SampleID train9 Target [1 for A->B; -1 otherwise] -1 Details [1 for A->B; 2 for A<-B; 3 for A-B; 4 for A|B] 2 Name: 8, dtype: object SampleID train12 Target [1 for A->B; -1 otherwise] 1 Details [1 for A->B; 2 for A<-B; 3 for A-B; 4 for A|B] 1 Name: 11, dtype: object Tangent: My Philosophy The Ultimate Goal: Automatic Feature Creation The data scientist is often the bottleneck Reproducibility
Applicability to a wider range of problems It was also a great fit for the problem (since the variables don't have context). The results: No visualization. No hand-tuned features. No human-driven feedback loop. No state of the art features. Doing the above things would probably improve the score. Feature Creation Problem: How do we get features? We have a matrix of pairs of non-uniform length! We want a 2-D matrix to stand on the shoulders of the existing ML algorithms. How do we not lose local information? In [15]: x, y = plot_pair(0, NN) # Let's focus on numerical-numerical fancy_plot(LinearRegression(), x, y) SampleID train2 Target [1 for A->B; -1 otherwise] -1 Details [1 for A->B; 2 for A<-B; 3 for A-B; 4 for A|B] 2 Name: 1, dtype: object
Solution: Compute metrics between predicted and true. Because we don't want to tune the metrics, let's use all of them! various distance metrics various statistical metrics various machine learning metrics Other solutions: Analyzing the images of the plots. Aggregate statistics: entropy, mean, variance, etc. (I used some of these too!) State of the art features Possible Improvement: Finding which metrics were most effective, and using more of those. Use cross-validated predictions instead of in-sample predictions. What we're at: for metric in METRICS: B_to_A = metric(true_A, fit_A) A_to_B = metric(true_B, fit_B) features.append(B_to_A) features.append(A_to_B) features.append(A_to_B - B_to_A) # Why not? Insert more of these...
Problem: How do we get a fit? Which technique would we use to be most applicable? In [16]: x, y = plot_pair(4, NN) # Note to self: show 0, then 4, then 5 fancy_plot(GradientBoostingRegressor(max_depth=3), x, y) SampleID train9 Target [1 for A->B; -1 otherwise] -1 Details [1 for A->B; 2 for A<-B; 3 for A-B; 4 for A|B] 2 Name: 8, dtype: object Solution: Key insight: machine learning is (to some extent) curve fitting! Instead of choosing a technique, let's have the machine learning model pick the good
Recommend
More recommend