Source code for secretflow.stats.table_statistics

# Copyright 2022 Ant Group Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License")
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
from secretflow.data.vertical import VDataFrame
from typing import Union


def table_statistics(table: Union[pd.DataFrame, VDataFrame]) -> pd.DataFrame:
    """Get table statistics for a pd.DataFrame or VDataFrame.

    Args:
        table: Union[pd.DataFrame, VDataFrame]

    Returns:
        table_statistics: pd.DataFrame indexed by the columns of ``table``,
        including each column's datatype, total_count, count, count_na, min,
        max, mean, var, std, sem, skew, kurtosis, q1, q2, q3, moment_2,
        moment_3, moment_4, central_moment_2, central_moment_3,
        central_moment_4, sum, sum_2, sum_3 and sum_4.

        moment_2 means E[X^2].
        central_moment_2 means E[(X - mean(X))^2].
        sum_2 means sum(X^2).

        Quantiles, moments, central moments and power sums are computed only
        on the columns selected by ``select_dtypes('number')``; columns for
        which a statistic is not produced are left as NaN when the per-column
        result is aligned on the index.

    Raises:
        AssertionError: if ``table`` is neither a pd.DataFrame nor a
            VDataFrame.
    """
    # Kept as an assert so callers that expect AssertionError are unaffected.
    assert isinstance(
        table, (pd.DataFrame, VDataFrame)
    ), "table must be a pd.DataFrame or VDataFrame"

    result = pd.DataFrame(index=table.columns)

    result['datatype'] = table.dtypes
    result['total_count'] = table.shape[0]
    result['count'] = table.count()
    result['count_na'] = table.isna().sum()

    result['min'] = table.min(numeric_only=True)
    result['max'] = table.max(numeric_only=True)
    result['mean'] = table.mean(numeric_only=True)
    result['var'] = table.var(numeric_only=True)
    result['std'] = table.std(numeric_only=True)
    result['sem'] = table.sem(numeric_only=True)
    result['skew'] = table.skew(numeric_only=True)
    result['kurtosis'] = table.kurtosis(numeric_only=True)

    # Select the numeric columns once and reuse them for every statistic that
    # needs arithmetic on the raw values.
    numeric = table.select_dtypes('number')

    # Quantiles are taken on the numeric subset: recent pandas versions no
    # longer drop non-numeric columns by default and would raise on them.
    result['q1'] = numeric.quantile(0.25)
    result['q2'] = numeric.quantile(0.5)
    result['q3'] = numeric.quantile(0.75)

    for order in (2, 3, 4):
        result[f'moment_{order}'] = numeric.pow(order).mean(numeric_only=True)

    # Filter to numeric columns BEFORE subtracting the mean; subtracting from
    # the full table fails on string/object columns.
    centered = numeric.subtract(result['mean'])
    for order in (2, 3, 4):
        result[f'central_moment_{order}'] = centered.pow(order).mean(
            numeric_only=True
        )

    result['sum'] = table.sum(numeric_only=True)
    for order in (2, 3, 4):
        result[f'sum_{order}'] = numeric.pow(order).sum(numeric_only=True)

    return result