# Copyright 2022 Ant Group Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import uuid
from typing import Callable, Dict, List, Union
import numpy
import secretflow.device.link as link
import xgboost.core as xgb_core
from secretflow.data.horizontal import HDataFrame
from secretflow.data.split import train_test_split
from secretflow.ml.boost.homo_boost.homo_decision_tree import HomoDecisionTree
from secretflow.ml.boost.homo_boost.tree_param import TreeParam
from secretflow.utils.errors import InvalidArgumentError


class FedBooster(xgb_core.Booster):
    """Federated booster.

    Internal implementation; it is not recommended that users call it directly.

    Attributes:
        params: Parameters for the booster.
        cache: List of cache items.
        model_file: Path to the model file if it is a string or PathLike.
    """

    def __init__(
self,
params: Dict = None,
cache: List = (),
model_file: Union[str, os.PathLike, xgb_core.Booster, bytearray] = None,
):
self.model_path = f"./{link.get_device()}_{uuid.uuid1()}.json"
if 'hess_key' in params:
self.hess_key = params.pop("hess_key")
else:
raise InvalidArgumentError("hess_key must be assignd!")
if 'grad_key' in params:
self.grad_key = params.pop("grad_key")
else:
raise InvalidArgumentError("grad_key must be assignd")
if 'label_key' in params:
self.label_key = params.pop("label_key")
else:
raise InvalidArgumentError("label_key must be assignd")
self.role = link.get_role()
super(FedBooster, self).__init__(
params=params, cache=cache, model_file=model_file
)
self.save_model(self.model_path)
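
    # The constructor expects the gradient, hessian and label column names to
    # be passed in through `params` (they are popped before the dict is handed
    # to xgboost). A minimal sketch of such a dict; the column names and
    # `dtrain` below are hypothetical placeholders, not required values:
    #
    #   params = {
    #       "objective": "binary:logistic",
    #       "max_depth": 3,
    #       "eta": 0.3,
    #       "hess_key": "hess",
    #       "grad_key": "grad",
    #       "label_key": "label",
    #   }
    #   booster = FedBooster(params=params, cache=[dtrain])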

    def federate_update(
self,
params: Dict,
dtrain: xgb_core.DMatrix,
hdata: HDataFrame,
bin_split_points: List,
iter_round: int = None,
fobj: Callable = None,
):
"""
federated update function, a variant in xgboost update
Args:
params: Training params dict
dtrain: Training data in dmatrix format
hdata: Training data in HdataFrame format
bin_split_points: Global split point
iter_round: Iteration rounds
fobj: Custom evaluation function
"""
if not isinstance(dtrain, xgb_core.DMatrix):
raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
self._validate_features(dtrain)
        # Create tree_param from the xgboost-style params, falling back to defaults
        tree_param = TreeParam(
            max_depth=params.get('max_depth', 3),
            eta=params.get('eta', 0.3),
            objective=params['objective'],
            verbosity=params.get('verbosity', 0),
            tree_method=params.get('tree_method', 'hist'),
            reg_lambda=params.get('lambda', 0.1),
            reg_alpha=params.get('alpha', 0.0),
            gamma=params.get('gamma', 1e-4),
            colsample_bytree=params.get('colsample_bytree', 1.0),
            colsample_byleval=params.get('colsample_bylevel', 1.0),
            base_score=params.get('base_score', 0.5),
            random_state=params.get('random_state', 1234),
            num_parallel=params.get('n_thread', None),
            subsample=params.get('subsample', 1.0),
            decimal=params.get('decimal', 10),
            num_class=params.get('num_class', 0),
        )
# sample by row
if tree_param.subsample < 1.0:
train_data, _ = train_test_split(
hdata, ratio=tree_param.subsample, random_state=tree_param.random_state
)
else:
train_data = hdata
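        # `predict(..., output_margin=True)` returns raw margin scores, and
        # `fobj` is expected to follow the usual xgboost custom-objective
        # convention: take the raw predictions and the DMatrix and return
        # per-sample (grad, hess). A minimal sketch of such an objective
        # (illustrative only, binary logistic loss):
        #
        #   def logistic_obj(pred, dtrain):
        #       label = dtrain.get_label()
        #       prob = 1.0 / (1.0 + numpy.exp(-pred))
        #       return prob - label, prob * (1.0 - prob)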
pred = self.predict(dtrain, output_margin=True, training=True)
grad, hess = fobj(pred, dtrain)
group_num = numpy.expand_dims(pred, axis=-1).shape[1]
# single thread
if group_num > 2:
            assert params['objective'] in [
                "multi:softmax",
                "multi:softprob",
            ], "objective must be 'multi:softmax' or 'multi:softprob' for multi-class tasks"
assert (
group_num == params['num_class']
), "group_num and num_class not aligned"
for group_id in range(group_num):
if group_num > 2:
hdata[self.grad_key], hdata[self.hess_key] = (
grad[:, group_id],
hess[:, group_id],
)
else:
hdata[self.grad_key], hdata[self.hess_key] = grad, hess
tree_id = iter_round * group_num + group_id
decision_tree = HomoDecisionTree(
tree_param=tree_param,
data=train_data,
bin_split_points=bin_split_points,
group_id=group_id,
tree_id=tree_id,
iter_round=iter_round,
hess_key=self.hess_key,
grad_key=self.grad_key,
label_key=self.label_key,
)
decision_tree.fit()
if self.role == link.CLIENT:
if tree_id == 0:
decision_tree.init_xgboost_model(self.model_path)
decision_tree.save_xgboost_model(
self.model_path, decision_tree.tree_node
)
logging.info(f"fit for iter_round={iter_round} done")
if self.role == link.CLIENT:
self.load_model(self.model_path)

    def save_model(self, fname: Union[str, os.PathLike]):
        """Save the model to a file.

        Args:
            fname: String or os.PathLike model path. If the suffix is .json,
                the model is stored in JSON format.
        """
if isinstance(fname, (str, os.PathLike)): # assume file name
fname = os.fspath(os.path.expanduser(fname))
xgb_core._check_call(
xgb_core._LIB.XGBoosterSaveModel(self.handle, xgb_core.c_str(fname))
)
else:
            raise TypeError("fname must be a string or os.PathLike")
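

# A minimal save/load round-trip sketch (illustrative only; `bst` is assumed to
# be an already-fitted FedBooster):
#
#   bst.save_model("model.json")  # JSON format because of the .json suffix
#   bst.load_model("model.json")  # load_model is inherited from xgb_core.Booster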