# SPDX-License-Identifier: Apache-2.0
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from typing import TYPE_CHECKING, List, Optional, Union
from opensearch_py_ml.query_compiler import QueryCompiler
if TYPE_CHECKING:
import pandas as pd # type: ignore
class GroupBy:
    """
    Base class for calls to :py:func:`opensearch_py_ml.DataFrame.groupby`

    Parameters
    ----------
    by:
        Column names to group by.
    query_compiler:
        Query compiler of the DataFrame being grouped. A copy is taken
        (via ``QueryCompiler(to_copy=...)``) so groupby operations do not
        mutate the parent frame's compiler state.
    dropna:
        Whether NA group keys are dropped from the result. Default True.
    """

    def __init__(
        self,
        by: List[str],
        query_compiler: "QueryCompiler",
        dropna: bool = True,
    ) -> None:
        # Copy the compiler so this GroupBy is isolated from the source DataFrame.
        self._query_compiler: "QueryCompiler" = QueryCompiler(to_copy=query_compiler)
        self._dropna: bool = dropna
        self._by: List[str] = by
class DataFrameGroupBy(GroupBy):
    """
    This holds all the groupby methods for :py:func:`opensearch_py_ml.DataFrame.groupby`
    """
def mean(self, numeric_only: bool = True) -> "pd.DataFrame":
"""
Compute the mean value for each group.
Parameters
----------
numeric_only: {True, False, None} Default is True
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
mean value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.mean`
Examples
--------
>>> from tests import OPENSEARCH_TEST_CLIENT
>>> df = oml.DataFrame(
... OPENSEARCH_TEST_CLIENT, "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +SKIP
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443
AR 674.827252 0.147541 2.744262 2018-01-21 22:18:06.593442627
AT 646.650530 0.175066 2.872679 2018-01-21 15:54:42.469496094
AU 669.558832 0.129808 2.843750 2018-01-22 02:28:39.199519287
CA 648.747109 0.134534 2.951271 2018-01-22 14:40:47.165254150
... ... ... ... ...
RU 662.994963 0.131258 2.832206 2018-01-21 07:11:16.534506104
SE 660.612988 0.149020 2.682353 2018-01-22 07:48:23.447058838
TR 485.253247 0.100000 1.900000 2018-01-16 16:02:33.000000000
US 595.774391 0.125315 2.753900 2018-01-21 16:55:04.456970215
ZA 643.053057 0.148410 2.766784 2018-01-22 15:17:56.141342773
<BLANKLINE>
[32 rows x 4 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["mean"],
dropna=self._dropna,
numeric_only=numeric_only,
)
[docs]
def var(self, numeric_only: bool = True) -> "pd.DataFrame":
"""
Compute the variance value for each group.
Parameters
----------
numeric_only: {True, False, None} Default is True
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
variance value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.var`
Examples
--------
>>> from tests import OPENSEARCH_TEST_CLIENT
>>> df = oml.DataFrame(
... OPENSEARCH_TEST_CLIENT, "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").var() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 75789.979090 0.130443 3.950549
AR 59683.055316 0.125979 3.783429
AT 65726.669676 0.144610 4.090013
AU 65088.483446 0.113094 3.833562
CA 68149.950516 0.116496 3.688139
... ... ... ...
RU 67305.277617 0.114107 3.852666
SE 53740.570338 0.127062 3.942132
TR 61245.521047 0.094868 4.100420
US 74349.939410 0.109638 3.758700
ZA 62920.072901 0.126608 3.775609
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["var"],
dropna=self._dropna,
numeric_only=numeric_only,
)
[docs]
def std(self, numeric_only: bool = True) -> "pd.DataFrame":
"""
Compute the standard deviation value for each group.
Parameters
----------
numeric_only: {True, False, None} Default is True
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
standard deviation value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.std`
Examples
--------
>>> from tests import OPENSEARCH_TEST_CLIENT
>>> df = oml.DataFrame(
... OPENSEARCH_TEST_CLIENT, "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").std() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 279.875500 0.367171 2.020634
AR 244.903626 0.355811 1.949901
AT 256.883342 0.381035 2.026411
AU 255.585377 0.336902 1.961486
CA 261.263054 0.341587 1.921980
... ... ... ...
RU 259.696213 0.338140 1.964815
SE 232.504297 0.357510 1.991340
TR 267.827572 0.333333 2.191454
US 272.774819 0.331242 1.939469
ZA 251.505568 0.356766 1.948258
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["std"],
dropna=self._dropna,
numeric_only=numeric_only,
)
[docs]
def mad(self, numeric_only: bool = True) -> "pd.DataFrame":
"""
Compute the median absolute deviation value for each group.
Parameters
----------
numeric_only: {True, False, None} Default is True
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
median absolute deviation value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.mad`
Examples
--------
>>> from tests import OPENSEARCH_TEST_CLIENT
>>> df = oml.DataFrame(
... OPENSEARCH_TEST_CLIENT, "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").mad() # doctest: +SKIP
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 233.697174 NaN 1.5
AR 189.250061 NaN 2.0
AT 195.823669 NaN 2.0
AU 202.539764 NaN 2.0
CA 203.344696 NaN 2.0
... ... ... ...
RU 206.431702 NaN 2.0
SE 178.658447 NaN 2.0
TR 221.863434 NaN 1.0
US 228.461365 NaN 2.0
ZA 192.162842 NaN 2.0
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["mad"],
dropna=self._dropna,
numeric_only=numeric_only,
)
[docs]
def sum(self, numeric_only: bool = True) -> "pd.DataFrame":
"""
Compute the sum value for each group.
Parameters
----------
numeric_only: {True, False, None} Default is True
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
sum value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.sum`
Examples
--------
>>> from tests import OPENSEARCH_TEST_CLIENT
>>> df = oml.DataFrame(
... OPENSEARCH_TEST_CLIENT, "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").sum() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 2.783612e+04 7.0 124.0
AR 2.058223e+05 45.0 837.0
AT 2.437872e+05 66.0 1083.0
AU 2.785365e+05 54.0 1183.0
CA 6.124173e+05 127.0 2786.0
... ... ... ...
RU 4.899533e+05 97.0 2093.0
SE 1.684563e+05 38.0 684.0
TR 4.852532e+03 1.0 19.0
US 1.183804e+06 249.0 5472.0
ZA 1.819840e+05 42.0 783.0
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["sum"],
dropna=self._dropna,
numeric_only=numeric_only,
)
[docs]
def min(self, numeric_only: bool = True) -> "pd.DataFrame":
"""
Compute the min value for each group.
Parameters
----------
numeric_only: {True, False, None} Default is True
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
min value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.min`
Examples
--------
>>> from tests import OPENSEARCH_TEST_CLIENT
>>> df = oml.DataFrame(
... OPENSEARCH_TEST_CLIENT, "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").min(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 110.799911 False 0 2018-01-01 19:31:30
AR 125.589394 False 0 2018-01-01 01:30:47
AT 100.020531 False 0 2018-01-01 05:24:19
AU 102.294312 False 0 2018-01-01 00:00:00
CA 100.557251 False 0 2018-01-01 00:44:08
... ... ... ... ...
RU 101.004005 False 0 2018-01-01 01:01:51
SE 102.877190 False 0 2018-01-01 04:09:38
TR 142.876465 False 0 2018-01-01 06:45:17
US 100.145966 False 0 2018-01-01 00:06:27
ZA 102.002663 False 0 2018-01-01 06:44:44
<BLANKLINE>
[32 rows x 4 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["min"],
dropna=self._dropna,
numeric_only=numeric_only,
)
[docs]
def max(self, numeric_only: bool = True) -> "pd.DataFrame":
"""
Compute the max value for each group.
Parameters
----------
numeric_only: {True, False, None} Default is True
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
max value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.max`
Examples
--------
>>> from tests import OPENSEARCH_TEST_CLIENT
>>> df = oml.DataFrame(
... OPENSEARCH_TEST_CLIENT, "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").max(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 1126.148682 True 6 2018-02-11 04:11:14
AR 1199.642822 True 6 2018-02-11 17:09:05
AT 1181.835815 True 6 2018-02-11 23:12:33
AU 1197.632690 True 6 2018-02-11 21:39:01
CA 1198.852539 True 6 2018-02-11 23:04:08
... ... ... ... ...
RU 1196.742310 True 6 2018-02-11 20:03:31
SE 1198.621582 True 6 2018-02-11 22:06:14
TR 855.935547 True 6 2018-02-04 01:59:23
US 1199.729004 True 6 2018-02-11 23:27:00
ZA 1196.186157 True 6 2018-02-11 23:29:45
<BLANKLINE>
[32 rows x 4 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["max"],
dropna=self._dropna,
numeric_only=numeric_only,
)
[docs]
def nunique(self) -> "pd.DataFrame":
"""
Compute the nunique value for each group.
Parameters
----------
numeric_only: {True, False, None} Default is True
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
nunique value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.nunique`
Examples
--------
>>> from tests import OPENSEARCH_TEST_CLIENT
>>> df = oml.DataFrame(
... OPENSEARCH_TEST_CLIENT, "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").nunique() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 46 2 7
AR 305 2 7
AT 377 2 7
AU 416 2 7
CA 944 2 7
... ... ... ...
RU 739 2 7
SE 255 2 7
TR 10 2 5
US 1987 2 7
ZA 283 2 7
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["nunique"],
dropna=self._dropna,
numeric_only=False,
)
[docs]
def quantile(
self, q: Union[int, float, List[int], List[float]] = 0.5
) -> "pd.DataFrame":
"""
Used to groupby and calculate quantile for a given DataFrame.
Parameters
----------
q:
float or array like, default 0.5
Value between 0 <= q <= 1, the quantile(s) to compute.
Returns
-------
pandas.DataFrame
quantile value for each grouped column
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.quantile`
Examples
--------
>>> from tests import OPENSEARCH_TEST_CLIENT
>>> oml_df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')
>>> oml_flights = oml_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp"])
>>> oml_flights.groupby(["dayOfWeek", "Cancelled"]).quantile() # doctest: +SKIP
AvgTicketPrice FlightDelayMin
dayOfWeek Cancelled
0 False 572.290384 0.0
True 578.140564 0.0
1 False 567.980560 0.0
True 582.618713 0.0
2 False 590.170986 0.0
True 579.811890 0.0
3 False 574.131340 0.0
True 572.852264 0.0
4 False 591.533699 0.0
True 582.877014 0.0
5 False 791.622625 0.0
True 793.362946 0.0
6 False 817.378523 0.0
True 766.855530 0.0
>>> oml_flights.groupby(["dayOfWeek", "Cancelled"]).quantile(q=[.2, .5]) # doctest: +SKIP
AvgTicketPrice FlightDelayMin
dayOfWeek Cancelled
0 False 0.2 319.925979 0.0
0.5 572.290384 0.0
True 0.2 325.704562 0.0
0.5 578.140564 0.0
1 False 0.2 327.311007 0.0
0.5 567.980560 0.0
True 0.2 336.839572 0.0
0.5 582.618713 0.0
2 False 0.2 332.323011 0.0
0.5 590.170986 0.0
True 0.2 314.472537 0.0
0.5 579.811890 0.0
3 False 0.2 327.652659 0.0
0.5 574.131340 0.0
True 0.2 298.483032 0.0
0.5 572.852264 0.0
4 False 0.2 314.290205 0.0
0.5 591.533699 0.0
True 0.2 325.024850 0.0
0.5 582.877014 0.0
5 False 0.2 567.362137 0.0
0.5 791.622625 0.0
True 0.2 568.323944 0.0
0.5 793.362946 0.0
6 False 0.2 568.489746 0.0
0.5 817.378523 0.0
True 0.2 523.890680 0.0
0.5 766.855530 0.0
"""
return self._query_compiler.aggs_groupby(
by=self._by, pd_aggs=["quantile"], quantiles=q, numeric_only=True
)
[docs]
def aggregate(
self, func: Union[str, List[str]], numeric_only: Optional[bool] = False
) -> "pd.DataFrame":
"""
Used to groupby and aggregate
Parameters
----------
func:
Functions to use for aggregating the data.
Accepted combinations are:
- function
- list of functions
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: returns all values with float64, NaN/NaT are ignored.
- False: returns all values with float64.
- None: returns all values with default datatype.
Returns
-------
pandas.DataFrame
aggregation value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.aggregate`
Examples
--------
>>> from tests import OPENSEARCH_TEST_CLIENT
>>> df = oml.DataFrame(
... OPENSEARCH_TEST_CLIENT, "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").aggregate(["min", "max"]) # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice ... dayOfWeek
min max ... min max
DestCountry ...
AE 110.799911 1126.148682 ... 0 6
AR 125.589394 1199.642822 ... 0 6
AT 100.020531 1181.835815 ... 0 6
AU 102.294312 1197.632690 ... 0 6
CA 100.557251 1198.852539 ... 0 6
... ... ... ... ... ..
RU 101.004005 1196.742310 ... 0 6
SE 102.877190 1198.621582 ... 0 6
TR 142.876465 855.935547 ... 0 6
US 100.145966 1199.729004 ... 0 6
ZA 102.002663 1196.186157 ... 0 6
<BLANKLINE>
[32 rows x 6 columns]
"""
# Controls whether a MultiIndex is used for the
# columns of the result DataFrame.
is_dataframe_agg = True
if isinstance(func, str):
func = [func]
is_dataframe_agg = False
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=func,
dropna=self._dropna,
numeric_only=numeric_only,
is_dataframe_agg=is_dataframe_agg,
)
agg = aggregate
[docs]
def count(self) -> "pd.DataFrame":
"""
Compute the count value for each group.
Returns
-------
pandas.DataFrame
nunique value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.count`
Examples
--------
>>> from tests import OPENSEARCH_TEST_CLIENT
>>> df = oml.DataFrame(
... OPENSEARCH_TEST_CLIENT, "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").count() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 46 46 46
AR 305 305 305
AT 377 377 377
AU 416 416 416
CA 944 944 944
... ... ... ...
RU 739 739 739
SE 255 255 255
TR 10 10 10
US 1987 1987 1987
ZA 283 283 283
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["count"],
dropna=self._dropna,
numeric_only=False,
is_dataframe_agg=False,
)
def mode(self) -> None:
raise NotImplementedError("Currently mode is not supported for groupby")