I think I'm having issues getting my vectorizer working within a gridsearch pipeline:
Data as a pandas DataFrame, x_train:
        bathrooms  bedrooms  price                       building_id                        manager_id
10            1.5         3   3000  53a5b119ba8f7b61d4e010512e0dfc85  5ba989232d0489da1b5f2c45f6688adc
10000         1.0         2   5465  c5c8a357cba207596b04d1afd1e4f130  7533621a882f71e25173b27e3139d83d
100004        1.0         1   2850  c3ba40552e2120b0acfc3cb5730bb2aa  d9039c43983f6e564b1482b273bd7b01
100007        1.0         1   3275  28d9ad350afeaab8027513a3e52ac8d5  1067e078446a7897d2da493d2f741316
100013        1.0         4   3350                                 0  98e13ad4b495b9613cef886d79a6291f

numeric_predictors = ['bathrooms', 'bedrooms', 'price']
categorical_predictors = ['building_id', 'manager_id']
MinMaxScaler fit & transform:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler

class MyScaler(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        self.scaler = MinMaxScaler()
        self.scaler.fit(X[self.cols])
        return self

    def transform(self, X):
        return self.scaler.transform(X[self.cols])
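For context, running the scaler by itself on the x_train above (a quick sanity check, assuming the data shown) should give values in [0, 1], so I don't believe the numeric columns are producing negatives:

# Standalone check of MyScaler (sketch; assumes the x_train shown above)
scaled = MyScaler(cols=numeric_predictors).fit(x_train).transform(x_train)
print(scaled.min(), scaled.max())  # MinMaxScaler output on the training data lies in [0, 1]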
My categorical feature hashing vectorizer:
from sklearn.feature_extraction import FeatureHasher
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer

class MyVectorizer(BaseEstimator, TransformerMixin):
    """Vectorize a set of categorical variables"""

    def __init__(self, cols, hashing=None):
        """
        args:
            cols: a list of column names of the categorical variables
            hashing: If None, then vectorization is a simple one-hot encoding.
                If an integer, then hashing is the number of features in the output.
        """
        self.cols = cols
        self.hashing = hashing

    def fit(self, X, y=None):
        data = X[self.cols]
        # Choose a vectorizer
        if self.hashing is None:
            self.myvec = HashingVectorizer()
        else:
            self.myvec = FeatureHasher(n_features=self.hashing)
        self.myvec.fit(X[self.cols].to_dict(orient='records'))
        return self

    def transform(self, X):
        # Vectorize input
        if self.hashing is None:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')),
                columns=self.myvec.feature_names_)
        else:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')).toarray())
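To show what the vectorizer actually hands to FeatureHasher: to_dict(orient='records') turns the categorical columns into one dict per row, which is the dict input FeatureHasher expects. A small sketch, assuming the x_train above:

# What FeatureHasher receives, and the hashed output shape (assumes x_train above)
records = x_train[categorical_predictors].to_dict(orient='records')
print(records[0])
# e.g. {'building_id': '53a5b119ba8f7b61d4e010512e0dfc85',
#       'manager_id': '5ba989232d0489da1b5f2c45f6688adc'}
hashed = MyVectorizer(cols=categorical_predictors, hashing=5).fit_transform(x_train)
print(hashed.shape)  # (n_rows, 5): one column per hashed feature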
GridSearch hyperparameters:
search_params = {
    'preprocess__vectorize__hashing': [20, 40, 80],
    'predict__alpha': [.01, .1, 1, 2, 10]
}
pipeline:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('preprocess', FeatureUnion([
        ('scale', MyScaler(cols=numeric_predictors)),
        ('vectorize', MyVectorizer(cols=categorical_predictors, hashing=5))
    ])),
    ('predict', MultinomialNB())
])
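To double-check that the keys in search_params match the pipeline, the nested parameter names (step names joined with double underscores) can be listed from the pipeline itself; this is just a sanity check, not part of the model:

# List the tunable names exposed by the pipeline; the grid keys must match these,
# e.g. 'preprocess__vectorize__hashing' and 'predict__alpha'
for name in sorted(pipeline.get_params().keys()):
    if 'hashing' in name or 'alpha' in name:
        print(name)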
And last, calling this with GridSearchCV:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipeline, search_params)
grid_search.fit(x_train, y_train)
I get a ValueError: Input X must be non-negative. I checked, and my numeric_predictor columns' data are all non-negative, so I am narrowing it down to an issue with the hashing of the categorical predictors.
ValueError Traceback (most recent call last)
<ipython-input-62-50522376d1e5> in <module>()
      1 grid_search = GridSearchCV(pipeline, search_params)
----> 2 grid_search.fit(x_train, y_train)
      3 grid_search.best_params_

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc in fit(self, X, y, groups, **fit_params)
    636                                   error_score=self.error_score)
    637           for parameters, (train, test) in product(candidate_params,
--> 638                                                     cv.split(X, y, groups)))
    639
    640         # if one choose to see train score, "out" will contain train score info

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333
    334     def get(self):

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
    129
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132
    133     def __len__(self):

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    435             estimator.fit(X_train, **fit_params)
    436         else:
--> 437             estimator.fit(X_train, y_train, **fit_params)
    438
    439     except Exception as e:

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params)
    257         Xt, fit_params = self._fit(X, y, **fit_params)
    258         if self._final_estimator is not None:
--> 259             self._final_estimator.fit(Xt, y, **fit_params)
    260         return self
    261

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in fit(self, X, y, sample_weight)
    602         self.feature_count_ = np.zeros((n_effective_classes, n_features),
    603                                        dtype=np.float64)
--> 604         self._count(X, Y)
    605         alpha = self._check_alpha()
    606         self._update_feature_log_prob(alpha)

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _count(self, X, Y)
    706         """Count and smooth feature occurrences."""
    707         if np.any((X.data if issparse(X) else X) < 0):
--> 708             raise ValueError("Input X must be non-negative")
    709         self.feature_count_ += safe_sparse_dot(Y.T, X)
    710         self.class_count_ += Y.sum(axis=0)

ValueError: Input X must be non-negative

> /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.py(708)_count()
    706         """Count and smooth feature occurrences."""
    707         if np.any((X.data if issparse(X) else X) < 0):
--> 708             raise ValueError("Input X must be non-negative")
    709         self.feature_count_ += safe_sparse_dot(Y.T, X)
    710         self.class_count_ += Y.sum(axis=0)
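To narrow this down, here is the kind of check I'm planning to run (a sketch, using the x_train above): transform the categorical columns with MyVectorizer alone and look for negative entries, since FeatureHasher uses a signed hash by default and MultinomialNB rejects any negative feature value:

# Does the hashed output contain negatives? (sketch; assumes x_train from above)
hashed = MyVectorizer(cols=categorical_predictors, hashing=20).fit_transform(x_train)
print((hashed.values < 0).any())  # True would explain the MultinomialNB error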