# Source code for vaex.ml.catboost

import base64
import os
import tempfile

import catboost
import numpy as np
import traitlets

import vaex
import vaex.serialize
from . import state
from . import generate


@vaex.serialize.register
@generate.register
class CatBoostModel(state.HasState):
    '''The CatBoost algorithm.

    This class provides an interface to the CatBoost algorithm.
    CatBoost is a fast, scalable, high performance Gradient Boosting on
    Decision Trees library, used for ranking, classification, regression and
    other machine learning tasks. For more information please visit
    https://github.com/catboost/catboost

    Example:

    >>> import vaex
    >>> import vaex.ml.catboost
    >>> df = vaex.datasets.iris()
    >>> features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']
    >>> df_train, df_test = df.ml.train_test_split()
    >>> params = {
    ...     'leaf_estimation_method': 'Gradient',
    ...     'learning_rate': 0.1,
    ...     'max_depth': 3,
    ...     'bootstrap_type': 'Bernoulli',
    ...     'objective': 'MultiClass',
    ...     'eval_metric': 'MultiClass',
    ...     'subsample': 0.8,
    ...     'random_state': 42,
    ...     'verbose': 0}
    >>> booster = vaex.ml.catboost.CatBoostModel(features=features, target='class_', num_boost_round=100, params=params)
    >>> booster.fit(df_train)
    >>> df_train = booster.transform(df_train)
    >>> df_train.head(3)
    #    sepal_length    sepal_width    petal_length    petal_width    class_  catboost_prediction
    0             5.4            3               4.5            1.5         1  [0.00615039 0.98024259 0.01360702]
    1             4.8            3.4             1.6            0.2         0  [0.99034267 0.00526382 0.0043935 ]
    2             6.9            3.1             4.9            1.5         1  [0.00688241 0.95190908 0.04120851]
    >>> df_test = booster.transform(df_test)
    >>> df_test.head(3)
    #    sepal_length    sepal_width    petal_length    petal_width    class_  catboost_prediction
    0             5.9            3               4.2            1.5         1  [0.00464228 0.98883351 0.00652421]
    1             6.1            3               4.6            1.4         1  [0.00350424 0.9882139  0.00828186]
    2             6.6            2.9             4.6            1.3         1  [0.00325705 0.98891631 0.00782664]
    '''
    snake_name = "catboost_model"
    features = traitlets.List(traitlets.Unicode(), help='List of features to use when fitting the CatBoostModel.')
    target = traitlets.Unicode(allow_none=False, help='The name of the target column.')
    num_boost_round = traitlets.CInt(default_value=None, allow_none=True, help='Number of boosting iterations.')
    params = traitlets.Dict(help='A dictionary of parameters to be passed on to the CatBoostModel model.')
    pool_params = traitlets.Dict(default_value={}, help='A dictionary of parameters to be passed to the Pool data object construction')
    prediction_name = traitlets.Unicode(default_value='catboost_prediction', help='The name of the virtual column housing the predictions.')
    prediction_type = traitlets.Enum(values=['Probability', 'Class', 'RawFormulaVal'], default_value='Probability',
                                     help='The form of the predictions. Can be "RawFormulaVal", "Probability" or "Class".')
    batch_size = traitlets.CInt(default_value=None, allow_none=True, help='If provided, will train in batches of this size.')
    batch_weights = traitlets.List(traitlets.Float(), default_value=[], allow_none=True, help='Weights to sum models at the end of training in batches.')
    evals_result_ = traitlets.List(traitlets.Dict(), default_value=[], help="Evaluation results")
    ctr_merge_policy = traitlets.Enum(values=['FailIfCtrsIntersects', 'LeaveMostDiversifiedTable', 'IntersectingCountersAverage'],
                                      default_value='IntersectingCountersAverage',
                                      help="Strategy for summing up models. Only used when training in batches. See the CatBoost documentation for more info.")

    def __call__(self, *args):
        '''Predict for the given feature arrays.

        Each positional argument is converted to a float64 array and the
        arguments are stacked along axis 1, i.e. one column per argument
        (presumably each argument is a 1-d array holding one feature, in the
        order of `features` -- this is how `transform` invokes it).

        :return: The booster predictions of type `prediction_type`.
        '''
        data2d = np.stack([np.asarray(arg, np.float64) for arg in args], axis=1)
        dmatrix = catboost.Pool(data2d, **self.pool_params)
        return self.booster.predict(dmatrix, prediction_type=self.prediction_type)

    def transform(self, df):
        '''Transform a DataFrame such that it contains the predictions of the CatBoostModel in form of a virtual column.

        :param df: A vaex DataFrame. It should have the same columns as the DataFrame used to train the model.

        :return copy: A shallow copy of the DataFrame that includes the CatBoostModel prediction as a virtual column.
        :rtype: DataFrame
        '''
        copy = df.copy()
        # Register this model as a function on the copy; unique=True lets vaex
        # pick a non-clashing function name.
        lazy_function = copy.add_function('catboost_prediction_function', self, unique=True)
        expression = lazy_function(*self.features)
        copy.add_virtual_column(self.prediction_name, expression, unique=False)
        return copy

    def _train(self, dtrain, evals, early_stopping_rounds, verbose_eval, plot, **kwargs):
        '''Train a single catboost model on `dtrain` using the model's `params` and `num_boost_round`.'''
        return catboost.train(params=self.params,
                              dtrain=dtrain,
                              num_boost_round=self.num_boost_round,
                              evals=evals,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose_eval,
                              plot=plot,
                              **kwargs)

    def fit(self, df, evals=None, early_stopping_rounds=None, verbose_eval=None, plot=False, progress=None, **kwargs):
        '''Fit the CatBoostModel model given a DataFrame.

        This method accepts all key word arguments for the catboost.train method.

        :param df: A vaex DataFrame containing the features and target on which to train the model.
        :param evals: A list of DataFrames to be evaluated during training.
            This allows user to watch performance on the validation sets.
            The list itself is not modified.
        :param int early_stopping_rounds: Activates early stopping.
        :param bool verbose_eval: Requires at least one item in *evals*.
            If *verbose_eval* is True then the evaluation metric on the validation set is printed at each boosting stage.
        :param bool plot: if True, display an interactive widget in the Jupyter
            notebook of how the train and validation sets score on each boosting iteration.
        :param progress: If True display a progressbar when the training is done in batches.
        :raises ValueError: When training in batches and `batch_weights` is
            non-empty but its length differs from the number of batches.
        '''
        self.pool_params['feature_names'] = self.features

        # Build the evaluation pools in a new list so that the caller's list
        # of DataFrames is left untouched and can be reused for another fit.
        eval_pools = None
        if evals is not None:
            eval_pools = [
                catboost.Pool(data=item[self.features].values,
                              label=item[self.target].to_numpy(),
                              **self.pool_params)
                for item in evals
            ]

        # This does the actual training/fitting of the catboost model
        if self.batch_size is None:
            data = df[self.features].values
            target_data = df[self.target].to_numpy()
            dtrain = catboost.Pool(data=data, label=target_data, **self.pool_params)
            model = self._train(dtrain, eval_pools, early_stopping_rounds, verbose_eval, plot, **kwargs)
            self.booster = model
            self.evals_result_ = [model.evals_result_]
            # NOTE: feature_importances_ is only populated on this (non-batch) path.
            self.feature_importances_ = list(model.feature_importances_)
        else:
            models = []
            evals_results = []

            # Set up progressbar
            n_samples = len(df)
            progressbar = vaex.utils.progressbars(progress, title="fit(catboost)")

            column_names = self.features + [self.target]
            iterator = df[column_names].to_pandas_df(chunk_size=self.batch_size)
            for i1, i2, chunk in iterator:
                progressbar(i1 / n_samples)
                data = chunk[self.features].values
                target_data = chunk[self.target].values
                dtrain = catboost.Pool(data=data, label=target_data, **self.pool_params)
                model = self._train(dtrain, eval_pools, early_stopping_rounds, verbose_eval, plot, **kwargs)
                evals_results.append(model.evals_result_)
                models.append(model)
            progressbar(1.0)

            # Assign rather than append in place, so that refitting starts
            # from a clean list (consistent with the non-batch path).
            self.evals_result_ = evals_results

            # Weights are key when summing models. `batch_weights` may be
            # None or empty, in both cases all models are weighted equally.
            if not self.batch_weights:
                batch_weights = [1 / len(models)] * len(models)
            elif len(self.batch_weights) != len(models):
                raise ValueError("'batch_weights' must be the same length as the number of models.")
            else:
                batch_weights = self.batch_weights

            # Sum the models
            self.booster = catboost.sum_models(models, weights=batch_weights, ctr_merge_policy=self.ctr_merge_policy)

    def predict(self, df, **kwargs):
        '''Provided a vaex DataFrame, get an in-memory numpy array with the predictions from the CatBoostModel model.
        This method accepts the key word arguments of the predict method from catboost.

        :param df: a vaex DataFrame

        :returns: A in-memory numpy array containing the CatBoostModel predictions.
        :rtype: numpy.array
        '''
        data = df[self.features].values
        dmatrix = catboost.Pool(data, **self.pool_params)
        return self.booster.predict(dmatrix, prediction_type=self.prediction_type, **kwargs)

    def state_get(self):
        '''Return the serializable state of the model.

        The booster is saved to a file in a private temporary directory, read
        back and base64 encoded; the directory is removed afterwards.

        :return: A dict with the encoded booster under 'tree_state' and the
            state of the parent class under 'substate'.
        '''
        # A private temporary directory avoids the race condition of the
        # deprecated tempfile.mktemp and guarantees the file is cleaned up.
        with tempfile.TemporaryDirectory() as tmpdir:
            filename = os.path.join(tmpdir, 'model.cbm')
            self.booster.save_model(filename)
            with open(filename, 'rb') as f:
                data = f.read()
        return dict(tree_state=base64.encodebytes(data).decode('ascii'),
                    substate=super(CatBoostModel, self).state_get())

    def state_set(self, state, trusted=True):
        '''Restore the model from a state produced by `state_get`.

        :param state: A dict with the keys 'tree_state' and 'substate'.
        :param trusted: Not used by this method.
        '''
        super(CatBoostModel, self).state_set(state['substate'])
        data = base64.decodebytes(state['tree_state'].encode('ascii'))
        # catboost loads models from a path, so the decoded bytes are written
        # to a file in a private temporary directory that is removed afterwards.
        with tempfile.TemporaryDirectory() as tmpdir:
            filename = os.path.join(tmpdir, 'model.cbm')
            with open(filename, 'wb') as f:
                f.write(data)
            self.booster = catboost.CatBoost().load_model(fname=filename)