Source code for secretflow.ml.boost.homo_boost.tree_core.feature_histogram

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2022 Ant Group Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from dataclasses import dataclass
from typing import Dict, List
from operator import add, sub
import copy
from concurrent.futures import ThreadPoolExecutor

import numpy
import pandas
import numpy as np

from secretflow.utils.errors import InvalidArgumentError


[docs]@dataclass()
class HistogramBag(object):
    """Histogram container

    Attributes:
        histogram: Histogram list calculated by calculate_histogram
        hid: histogram id
        p_hid: parent histogram id
    """

    histogram: List = None
    hid: int = -1
    p_hid: int = -1

[docs]    def binary_op(self, other, func: callable, inplace: bool = False):

        assert isinstance(
            other, HistogramBag
        ), f"Expect HistogramBag but got instance of {type(other)}"
        assert len(self.histogram) == len(
            other
        ), f"Expect two same length factors, but got {len(self.histogram)} and {len(other)}"

        histogram = self.histogram
        new_histogram = None
        if not inplace:
            new_histogram = copy.deepcopy(other)
            histogram = new_histogram.histogram
        for f_idx in range(len(self.histogram)):
            for hist_idx in range(len(self.histogram[f_idx])):
                # grad
                histogram[f_idx][hist_idx][0] = func(
                    self.histogram[f_idx][hist_idx][0], other[f_idx][hist_idx][0]
                )
                # hess
                histogram[f_idx][hist_idx][1] = func(
                    self.histogram[f_idx][hist_idx][1], other[f_idx][hist_idx][1]
                )
                # sample
                histogram[f_idx][hist_idx][2] = func(
                    self.histogram[f_idx][hist_idx][2], other[f_idx][hist_idx][2]
                )

        return self if inplace else new_histogram

    def __add__(self, other):
        return self.binary_op(other, add, inplace=False)

    def __sub__(self, other):
        return self.binary_op(other, sub, inplace=False)

    def __len__(self):
        return len(self.histogram)

    def __getitem__(self, item: int):
        return self.histogram[item]

    def __str__(self):
        return str(self.histogram)

    def __repr__(self):
        return str(self.histogram)


[docs]class FeatureHistogram:
    """Feature Histogram"""

    @staticmethod
    def _cal_histogram_once(
        data_frame,
        bin_split_points,
        valid_features,
        use_missing,
        grad_key,
        hess_key,
        thread_pool,
    ):
        if len(data_frame) == 0:
            f_histogram = FeatureHistogram._generate_empty_histogram(
                bin_split_points, valid_features, 1 if use_missing else 0
            )
        else:
            f_histogram = FeatureHistogram._node_calculate_histogram(
                data_frame,
                bin_split_points=bin_split_points,
                valid_features=valid_features,
                use_missing=use_missing,
                grad_key=grad_key,
                hess_key=hess_key,
                thread_pool=thread_pool,
            )
        return f_histogram

[docs]    @staticmethod
    def calculate_histogram(
        data_frame_list: List[pandas.DataFrame],
        bin_split_points: numpy.ndarray,
        valid_features: Dict = None,
        use_missing: bool = False,
        grad_key: str = "grad",
        hess_key: str = "hess",
        thread_pool: ThreadPoolExecutor = None,
    ):
        """
        Calculate histogram according to G and H
        histogram: [cols,[buckets,[sum_g,sum_h,count]]

        Args:
            data_frame_list: A list of data frame, which contain grad and hess
            bin_split_points: global split point dicts
            valid_features: valid feature names Dict[id:bool]
            use_missing: whether missing value participate in train
            grad_key: unique column name for grad value
            hess_key: unique column name for hess value
        Returns:
            node_histograms:一个List[histogram1, histogram2, ...]
        """
        node_histograms = []
        for data_frame in data_frame_list:
            node_histograms.append(
                FeatureHistogram._cal_histogram_once(
                    data_frame,
                    bin_split_points,
                    valid_features,
                    use_missing,
                    grad_key,
                    hess_key,
                    thread_pool,
                )
            )

        return node_histograms

    @staticmethod
    def _generate_empty_histogram(
        bin_split_points: Dict, valid_features: Dict, missing_bin: int
    ):
        """If data if empty, generate empty histogram
        Args:
            bin_split_points: global bin split points
            valid_features: Dict for valid features
            missing_bin: Num of missing bin
        Returns:
            feature_histogram_template: return empty histogram
        """
        feature_histogram_template = []
        for fid in range(len(bin_split_points)):
            # if is not valid features, skip generating
            if valid_features is not None and valid_features[fid] is False:
                feature_histogram_template.append([])
                continue
            else:
                # [0, 0, 0] -> [grad, hess, sample count]
                feature_histogram_template.append(
                    [[0, 0, 0] for j in range(len(bin_split_points[fid]) + missing_bin)]
                )

        # check feature num
        assert len(feature_histogram_template) == len(
            bin_split_points
        ), "Length of feature_histogram_template and bin_split_points not consistent"

        return feature_histogram_template

    @staticmethod
    def _cal_point_hist(data_slice):
        sum_mat = data_slice.sum(axis=0)
        sum_grad = sum_mat[1]
        sum_hess = sum_mat[2]
        sum_count = data_slice.shape[0]
        return [sum_grad, sum_hess, sum_count]

[docs]    @staticmethod
    def calculate_single_histogram(data: np.ndarray, bin_split_point: np.ndarray):
        f_histogram = []

        for bin_t in bin_split_point:
            f_histogram.append(
                FeatureHistogram._cal_point_hist(data[data[:, 0] < bin_t])
            )

        return f_histogram

    @staticmethod
    def _node_calculate_histogram(
        data_frame: pandas.DataFrame,
        bin_split_points: np.ndarray = None,
        valid_features: Dict = None,
        use_missing: bool = False,
        grad_key="grad",
        hess_key="hess",
        thread_pool=None,
    ):
        """function to calculate histogram on node

        Args:
            data_frame: data frame with grad and hess
            bin_split_points: global bin split point
            valid_features: valid feature names Dict[id:bool]
            use_missing: whether missing value participate in train
            grad_key: unique column name for grad value
            hess_key: unique column name for hess value

        Returns:
            single_histogram: histogram of this node
        """
        if thread_pool is None:
            thread_pool = ThreadPoolExecutor()
        single_histogram = []
        np_data = data_frame.to_numpy()
        if valid_features is None:
            raise InvalidArgumentError("valid can not be None")
        header = data_frame.columns.tolist()
        valid_features_list = [k for k, v in valid_features.items() if v]

        futures = {}
        for fid in range(len(bin_split_points)):
            futures[fid] = thread_pool.submit(
                FeatureHistogram._bin_hist,
                np_data,
                fid,
                grad_key,
                hess_key,
                valid_features_list,
                header,
                bin_split_points,
                use_missing,
            )

        for ret in futures.values():
            single_histogram.append(np.array(ret.result()))
        return single_histogram

    @staticmethod
    def _bin_hist(
        data,
        fid,
        grad_key,
        hess_key,
        valid_features_list,
        header,
        bin_split_points,
        use_missing,
    ):
        if fid in valid_features_list:
            t_data = data[:, [fid, header.index(grad_key), header.index(hess_key)]]
            f_histogram = FeatureHistogram.calculate_single_histogram(
                t_data, bin_split_points[fid]
            )
            if use_missing:
                miss_grad = t_data[1:].sum() - f_histogram[-1][0]
                miss_hess = t_data[2:].sum() - f_histogram[-1][1]
                miss_count = len(t_data) - f_histogram[-1][2]
                f_histogram.append([miss_grad, miss_hess, miss_count])
        else:
            f_histogram = []
        return f_histogram