Source code for opensearch_py_ml.groupby

# SPDX-License-Identifier: Apache-2.0
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.


#  Licensed to Elasticsearch B.V. under one or more contributor
#  license agreements. See the NOTICE file distributed with
#  this work for additional information regarding copyright
#  ownership. Elasticsearch B.V. licenses this file to you under
#  the Apache License, Version 2.0 (the "License"); you may
#  not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
# 	http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

from typing import TYPE_CHECKING, List, Optional, Union

from opensearch_py_ml.query_compiler import QueryCompiler

if TYPE_CHECKING:
    import pandas as pd  # type: ignore


class GroupBy:
    """
    Base class for calls to :py:func:`opensearch_py_ml.DataFrame.groupby`

    Parameters
    ----------
    by:
        Names of the columns to group by.
    query_compiler:
        Query compiler to build on. It is not stored directly; a copy is
        created with ``QueryCompiler(to_copy=query_compiler)`` and kept
        on the instance.
    dropna:
        Stored on the instance as ``_dropna``. Default is True.
    """

    def __init__(
        self,
        by: List[str],
        query_compiler: "QueryCompiler",
        dropna: bool = True,
    ) -> None:
        # Plain values are stored as given.
        self._by: List[str] = by
        self._dropna: bool = dropna
        # Work on a private copy of the compiler rather than the caller's object.
        self._query_compiler: "QueryCompiler" = QueryCompiler(
            to_copy=query_compiler
        )


class DataFrameGroupBy(GroupBy):
    """
    This holds all the groupby methods for :py:func:`opensearch_py_ml.DataFrame.groupby`

    Every aggregation method delegates to ``QueryCompiler.aggs_groupby``
    using the grouping columns and ``dropna`` flag stored on the instance.
    """

    def mean(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
        """
        Compute the mean value for each group.

        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.DataFrame
            mean value for each numeric column of each group

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.mean`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> df = oml.DataFrame(
        ...   OPENSEARCH_TEST_CLIENT, "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +SKIP
                     AvgTicketPrice  Cancelled  dayOfWeek                     timestamp
        DestCountry
        AE               605.132970   0.152174   2.695652 2018-01-21 16:58:07.891304443
        AR               674.827252   0.147541   2.744262 2018-01-21 22:18:06.593442627
        AT               646.650530   0.175066   2.872679 2018-01-21 15:54:42.469496094
        AU               669.558832   0.129808   2.843750 2018-01-22 02:28:39.199519287
        CA               648.747109   0.134534   2.951271 2018-01-22 14:40:47.165254150
        ...                     ...        ...        ...                           ...
        RU               662.994963   0.131258   2.832206 2018-01-21 07:11:16.534506104
        SE               660.612988   0.149020   2.682353 2018-01-22 07:48:23.447058838
        TR               485.253247   0.100000   1.900000 2018-01-16 16:02:33.000000000
        US               595.774391   0.125315   2.753900 2018-01-21 16:55:04.456970215
        ZA               643.053057   0.148410   2.766784 2018-01-22 15:17:56.141342773
        <BLANKLINE>
        [32 rows x 4 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["mean"],
            dropna=self._dropna,
            numeric_only=numeric_only,
        )

    def var(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
        """
        Compute the variance value for each group.

        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.DataFrame
            variance value for each numeric column of each group

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.var`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> df = oml.DataFrame(
        ...   OPENSEARCH_TEST_CLIENT, "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").var() # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE             75789.979090   0.130443   3.950549
        AR             59683.055316   0.125979   3.783429
        AT             65726.669676   0.144610   4.090013
        AU             65088.483446   0.113094   3.833562
        CA             68149.950516   0.116496   3.688139
        ...                     ...        ...        ...
        RU             67305.277617   0.114107   3.852666
        SE             53740.570338   0.127062   3.942132
        TR             61245.521047   0.094868   4.100420
        US             74349.939410   0.109638   3.758700
        ZA             62920.072901   0.126608   3.775609
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["var"],
            dropna=self._dropna,
            numeric_only=numeric_only,
        )

    def std(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
        """
        Compute the standard deviation value for each group.

        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.DataFrame
            standard deviation value for each numeric column of each group

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.std`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> df = oml.DataFrame(
        ...   OPENSEARCH_TEST_CLIENT, "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").std() # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE               279.875500   0.367171   2.020634
        AR               244.903626   0.355811   1.949901
        AT               256.883342   0.381035   2.026411
        AU               255.585377   0.336902   1.961486
        CA               261.263054   0.341587   1.921980
        ...                     ...        ...        ...
        RU               259.696213   0.338140   1.964815
        SE               232.504297   0.357510   1.991340
        TR               267.827572   0.333333   2.191454
        US               272.774819   0.331242   1.939469
        ZA               251.505568   0.356766   1.948258
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["std"],
            dropna=self._dropna,
            numeric_only=numeric_only,
        )

    def mad(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
        """
        Compute the median absolute deviation value for each group.

        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.DataFrame
            median absolute deviation value for each numeric column of each group

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.mad`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> df = oml.DataFrame(
        ...   OPENSEARCH_TEST_CLIENT, "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").mad() # doctest: +SKIP
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE               233.697174        NaN        1.5
        AR               189.250061        NaN        2.0
        AT               195.823669        NaN        2.0
        AU               202.539764        NaN        2.0
        CA               203.344696        NaN        2.0
        ...                     ...        ...        ...
        RU               206.431702        NaN        2.0
        SE               178.658447        NaN        2.0
        TR               221.863434        NaN        1.0
        US               228.461365        NaN        2.0
        ZA               192.162842        NaN        2.0
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["mad"],
            dropna=self._dropna,
            numeric_only=numeric_only,
        )

    def median(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
        """
        Compute the median value for each group.

        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.DataFrame
            median value for each numeric column of each group

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.median`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> df = oml.DataFrame(
        ...   OPENSEARCH_TEST_CLIENT, "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").median(numeric_only=False) # doctest: +SKIP
                     AvgTicketPrice  Cancelled  dayOfWeek               timestamp
        DestCountry
        AE               585.720490      False          2 2018-01-19 23:56:44.000
        AR               678.447433      False          3 2018-01-22 10:18:50.000
        AT               659.715592      False          3 2018-01-20 20:40:10.000
        AU               689.241348      False          3 2018-01-22 18:46:11.000
        CA               663.516057      False          3 2018-01-22 21:35:09.500
        ...                     ...        ...        ...                     ...
        RU               670.714956      False          3 2018-01-20 16:48:16.000
        SE               680.111084      False          3 2018-01-22 20:53:44.000
        TR               441.681122      False          1 2018-01-13 23:17:27.000
        US               600.591525      False          3 2018-01-22 04:09:50.000
        ZA               633.935425      False          3 2018-01-23 17:42:57.000
        <BLANKLINE>
        [32 rows x 4 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["median"],
            dropna=self._dropna,
            numeric_only=numeric_only,
        )

    def sum(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
        """
        Compute the sum value for each group.

        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.DataFrame
            sum value for each numeric column of each group

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.sum`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> df = oml.DataFrame(
        ...   OPENSEARCH_TEST_CLIENT, "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").sum() # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE             2.783612e+04        7.0      124.0
        AR             2.058223e+05       45.0      837.0
        AT             2.437872e+05       66.0     1083.0
        AU             2.785365e+05       54.0     1183.0
        CA             6.124173e+05      127.0     2786.0
        ...                     ...        ...        ...
        RU             4.899533e+05       97.0     2093.0
        SE             1.684563e+05       38.0      684.0
        TR             4.852532e+03        1.0       19.0
        US             1.183804e+06      249.0     5472.0
        ZA             1.819840e+05       42.0      783.0
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["sum"],
            dropna=self._dropna,
            numeric_only=numeric_only,
        )

    def min(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
        """
        Compute the min value for each group.

        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.DataFrame
            min value for each numeric column of each group

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.min`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> df = oml.DataFrame(
        ...   OPENSEARCH_TEST_CLIENT, "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").min(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek           timestamp
        DestCountry
        AE               110.799911      False          0 2018-01-01 19:31:30
        AR               125.589394      False          0 2018-01-01 01:30:47
        AT               100.020531      False          0 2018-01-01 05:24:19
        AU               102.294312      False          0 2018-01-01 00:00:00
        CA               100.557251      False          0 2018-01-01 00:44:08
        ...                     ...        ...        ...                 ...
        RU               101.004005      False          0 2018-01-01 01:01:51
        SE               102.877190      False          0 2018-01-01 04:09:38
        TR               142.876465      False          0 2018-01-01 06:45:17
        US               100.145966      False          0 2018-01-01 00:06:27
        ZA               102.002663      False          0 2018-01-01 06:44:44
        <BLANKLINE>
        [32 rows x 4 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["min"],
            dropna=self._dropna,
            numeric_only=numeric_only,
        )

    def max(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
        """
        Compute the max value for each group.

        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.DataFrame
            max value for each numeric column of each group

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.max`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> df = oml.DataFrame(
        ...   OPENSEARCH_TEST_CLIENT, "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").max(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek           timestamp
        DestCountry
        AE              1126.148682       True          6 2018-02-11 04:11:14
        AR              1199.642822       True          6 2018-02-11 17:09:05
        AT              1181.835815       True          6 2018-02-11 23:12:33
        AU              1197.632690       True          6 2018-02-11 21:39:01
        CA              1198.852539       True          6 2018-02-11 23:04:08
        ...                     ...        ...        ...                 ...
        RU              1196.742310       True          6 2018-02-11 20:03:31
        SE              1198.621582       True          6 2018-02-11 22:06:14
        TR               855.935547       True          6 2018-02-04 01:59:23
        US              1199.729004       True          6 2018-02-11 23:27:00
        ZA              1196.186157       True          6 2018-02-11 23:29:45
        <BLANKLINE>
        [32 rows x 4 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["max"],
            dropna=self._dropna,
            numeric_only=numeric_only,
        )

    def nunique(self) -> "pd.DataFrame":
        """
        Compute the nunique value for each group.

        This method takes no arguments; the aggregation is always requested
        with ``numeric_only=False``.

        Returns
        -------
        pandas.DataFrame
            nunique value for each column of each group

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.nunique`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> df = oml.DataFrame(
        ...   OPENSEARCH_TEST_CLIENT, "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").nunique() # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE                       46          2          7
        AR                      305          2          7
        AT                      377          2          7
        AU                      416          2          7
        CA                      944          2          7
        ...                     ...        ...        ...
        RU                      739          2          7
        SE                      255          2          7
        TR                       10          2          5
        US                     1987          2          7
        ZA                      283          2          7
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["nunique"],
            dropna=self._dropna,
            numeric_only=False,
        )

    def quantile(
        self, q: Union[int, float, List[int], List[float]] = 0.5
    ) -> "pd.DataFrame":
        """
        Used to groupby and calculate quantile for a given DataFrame.

        Parameters
        ----------
        q:
            float or array like, default 0.5
            Value between 0 <= q <= 1, the quantile(s) to compute.

        Returns
        -------
        pandas.DataFrame
            quantile value for each grouped column

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.quantile`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> oml_df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')
        >>> oml_flights = oml_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp", "Cancelled"])
        >>> oml_flights.groupby(["dayOfWeek", "Cancelled"]).quantile() # doctest: +SKIP
                             AvgTicketPrice  FlightDelayMin
        dayOfWeek Cancelled
        0         False          572.290384             0.0
                  True           578.140564             0.0
        1         False          567.980560             0.0
                  True           582.618713             0.0
        2         False          590.170986             0.0
                  True           579.811890             0.0
        3         False          574.131340             0.0
                  True           572.852264             0.0
        4         False          591.533699             0.0
                  True           582.877014             0.0
        5         False          791.622625             0.0
                  True           793.362946             0.0
        6         False          817.378523             0.0
                  True           766.855530             0.0

        >>> oml_flights.groupby(["dayOfWeek", "Cancelled"]).quantile(q=[.2, .5]) # doctest: +SKIP
                                 AvgTicketPrice  FlightDelayMin
        dayOfWeek Cancelled
        0         False     0.2      319.925979             0.0
                            0.5      572.290384             0.0
                  True      0.2      325.704562             0.0
                            0.5      578.140564             0.0
        1         False     0.2      327.311007             0.0
                            0.5      567.980560             0.0
                  True      0.2      336.839572             0.0
                            0.5      582.618713             0.0
        2         False     0.2      332.323011             0.0
                            0.5      590.170986             0.0
                  True      0.2      314.472537             0.0
                            0.5      579.811890             0.0
        3         False     0.2      327.652659             0.0
                            0.5      574.131340             0.0
                  True      0.2      298.483032             0.0
                            0.5      572.852264             0.0
        4         False     0.2      314.290205             0.0
                            0.5      591.533699             0.0
                  True      0.2      325.024850             0.0
                            0.5      582.877014             0.0
        5         False     0.2      567.362137             0.0
                            0.5      791.622625             0.0
                  True      0.2      568.323944             0.0
                            0.5      793.362946             0.0
        6         False     0.2      568.489746             0.0
                            0.5      817.378523             0.0
                  True      0.2      523.890680             0.0
                            0.5      766.855530             0.0
        """
        # Forward the instance's dropna flag like every other aggregation
        # here does, so groupby(..., dropna=False) is honoured for quantiles
        # too instead of silently falling back to the compiler's default.
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["quantile"],
            quantiles=q,
            dropna=self._dropna,
            numeric_only=True,
        )

    def aggregate(
        self, func: Union[str, List[str]], numeric_only: Optional[bool] = False
    ) -> "pd.DataFrame":
        """
        Used to groupby and aggregate

        Parameters
        ----------
        func:
            Name(s) of the function(s) to use for aggregating the data.

            Accepted combinations are:
            - function name (str)
            - list of function names

        numeric_only: {True, False, None} Default is False
            Which datatype to be returned
            - True: returns all values with float64, NaN/NaT are ignored.
            - False: returns all values with float64.
            - None: returns all values with default datatype.

        Returns
        -------
        pandas.DataFrame
            aggregation value for each numeric column of each group

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.aggregate`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> df = oml.DataFrame(
        ...   OPENSEARCH_TEST_CLIENT, "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").aggregate(["min", "max"]) # doctest: +NORMALIZE_WHITESPACE
                    AvgTicketPrice               ... dayOfWeek
                               min          max  ...       min max
        DestCountry                              ...
        AE              110.799911  1126.148682  ...         0   6
        AR              125.589394  1199.642822  ...         0   6
        AT              100.020531  1181.835815  ...         0   6
        AU              102.294312  1197.632690  ...         0   6
        CA              100.557251  1198.852539  ...         0   6
        ...                    ...          ...  ...       ...  ..
        RU              101.004005  1196.742310  ...         0   6
        SE              102.877190  1198.621582  ...         0   6
        TR              142.876465   855.935547  ...         0   6
        US              100.145966  1199.729004  ...         0   6
        ZA              102.002663  1196.186157  ...         0   6
        <BLANKLINE>
        [32 rows x 6 columns]
        """
        # Controls whether a MultiIndex is used for the columns of the
        # result DataFrame: a bare function name is a single aggregation,
        # anything else is treated as a DataFrame-style aggregation.
        single_name = isinstance(func, str)
        pd_aggs = [func] if single_name else func
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=pd_aggs,
            dropna=self._dropna,
            numeric_only=numeric_only,
            is_dataframe_agg=not single_name,
        )

    # Short alias for ``aggregate``.
    agg = aggregate

    def count(self) -> "pd.DataFrame":
        """
        Compute the count value for each group.

        Returns
        -------
        pandas.DataFrame
            count value for each column of each group

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.count`

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT

        >>> df = oml.DataFrame(
        ...   OPENSEARCH_TEST_CLIENT, "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").count() # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE                       46         46         46
        AR                      305        305        305
        AT                      377        377        377
        AU                      416        416        416
        CA                      944        944        944
        ...                     ...        ...        ...
        RU                      739        739        739
        SE                      255        255        255
        TR                       10         10         10
        US                     1987       1987       1987
        ZA                      283        283        283
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["count"],
            dropna=self._dropna,
            numeric_only=False,
            is_dataframe_agg=False,
        )

    def mode(self) -> None:
        """
        Not supported for groupby.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError("Currently mode is not supported for groupby")