# Source code for opensearch_py_ml.series

# SPDX-License-Identifier: Apache-2.0
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.


#  Licensed to Elasticsearch B.V. under one or more contributor
#  license agreements. See the NOTICE file distributed with
#  this work for additional information regarding copyright
#  ownership. Elasticsearch B.V. licenses this file to you under
#  the Apache License, Version 2.0 (the "License"); you may
#  not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
# 	http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

"""
Series
---------
One-dimensional ndarray with axis labels (including time series).

The underlying data resides in OpenSearch and the API aligns as much as
possible with the pandas.Series API.

This allows the opensearch_py_ml.Series to access large datasets stored in OpenSearch,
without storing the dataset in local memory.

Implementation Details
----------------------
Based on NDFrame which underpins opensearch_py_ml.DataFrame
"""

import sys
import warnings
from collections.abc import Collection
from datetime import datetime
from io import StringIO
from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd  # type: ignore
from pandas.io.common import _expand_user, stringify_path  # type: ignore

import opensearch_py_ml.plotting
from opensearch_py_ml.arithmetics import (
    ArithmeticNumber,
    ArithmeticSeries,
    ArithmeticString,
)
from opensearch_py_ml.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from opensearch_py_ml.filter import (
    BooleanFilter,
    Equal,
    Greater,
    GreaterEqual,
    IsIn,
    IsNull,
    Less,
    LessEqual,
    NotFilter,
    NotNull,
    QueryFilter,
    ScriptFilter,
)
from opensearch_py_ml.ndframe import NDFrame
from opensearch_py_ml.utils import to_list

if TYPE_CHECKING:
    from opensearchpy import OpenSearch

    from opensearch_py_ml.query_compiler import QueryCompiler


def _get_method_name() -> str:
    """Return the name of the function that invoked this helper.

    Looks exactly one frame up the call stack and reports the ``co_name`` of
    that frame's code object.
    """
    caller_frame = sys._getframe(1)
    return caller_frame.f_code.co_name


class Series(NDFrame):
    """
    pandas.Series like API that proxies into OpenSearch index(os).

    Parameters
    ----------
    os_client : opensearchpy.OpenSearch
        A reference to an OpenSearch python client
    os_index_pattern : str
        An OpenSearch index pattern. This can contain wildcards.
    name : str
        Name of the single column of the series. When given, it is passed
        to ``NDFrame`` as a one-element ``columns`` list.
    os_index_field : str
        The field to base the series on
    _query_compiler : QueryCompiler, optional
        Forwarded unchanged to ``NDFrame``.

    Notes
    -----
    If the OpenSearch index is deleted or index mappings are changed after this
    object is created, the object is not rebuilt and so inconsistencies can occur.

    See Also
    --------
    :pandas_api_docs:`pandas.Series`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> oml.Series(os_client=OPENSEARCH_TEST_CLIENT, os_index_pattern='flights', name='Carrier')
    0         Kibana Airlines
    1        Logstash Airways
    2        Logstash Airways
    3         Kibana Airlines
    4         Kibana Airlines
                   ...
    13054    Logstash Airways
    13055    Logstash Airways
    13056    Logstash Airways
    13057            JetBeats
    13058            JetBeats
    Name: Carrier, Length: 13059, dtype: object
    """

    def __init__(
        self,
        os_client: Optional["OpenSearch"] = None,
        os_index_pattern: Optional[str] = None,
        name: Optional[str] = None,
        os_index_field: Optional[str] = None,
        _query_compiler: Optional["QueryCompiler"] = None,
    ) -> None:
        # Series has 1 column
        columns = None if name is None else [name]

        super().__init__(
            os_client=os_client,
            os_index_pattern=os_index_pattern,
            columns=columns,
            os_index_field=os_index_field,
            _query_compiler=_query_compiler,
        )

    hist = opensearch_py_ml.plotting.oml_hist_series

    @property
    def empty(self) -> bool:
        """
        Determines if the Series is empty.

        Returns
        -------
        bool
            True if the Series is empty (``len(self.index) == 0``).
            False otherwise.
        """
        return len(self.index) == 0

    @property
    def shape(self) -> Tuple[int, int]:
        """
        Return a tuple representing the dimensionality of the Series.

        Returns
        -------
        shape: tuple
            0. number of rows
            1. number of columns

        Notes
        -----
        - number of rows ``len(series)`` queries OpenSearch
        - number of columns == 1

        Examples
        --------
        >>> from tests import OPENSEARCH_TEST_CLIENT
        >>> df = oml.Series(OPENSEARCH_TEST_CLIENT, 'ecommerce', name='total_quantity')
        >>> df.shape
        (4675, 1)
        """
        num_rows = len(self)
        num_columns = 1

        return num_rows, num_columns

    @property
    def os_field_name(self) -> str:
        """
        Returns
        -------
        os_field_name: str
            Return the OpenSearch field name for this series
        """
        # First (and only) entry of the field-name list, scripted fields included.
        return self._query_compiler.get_field_names(include_scripted_fields=True)[0]

    @property
    def name(self) -> str:
        """
        Returns
        -------
        name: str
            The first entry of the query compiler's ``columns``.
        """
        return self._query_compiler.columns[0]

    @name.setter
    def name(self, name: str) -> None:
        # Renames in place on the query compiler, mapping the current name to the new one.
        self._query_compiler.rename({self.name: name}, inplace=True)
def rename(self, new_name: str) -> "Series":
    """
    Rename name of series. Only column rename is supported. This does not change the underlying
    index, but adds a symbolic link from the new name (column) to the OpenSearch field name.

    For instance, if a field was called 'total_quantity' it could be renamed 'Total Quantity'.

    Parameters
    ----------
    new_name: str

    Returns
    -------
    opensearch_py_ml.Series
        opensearch_py_ml.Series with new name.

    See Also
    --------
    :pandas_api_docs:`pandas.Series.rename`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')
    >>> df.Carrier
    0         Kibana Airlines
    1        Logstash Airways
    2        Logstash Airways
    3         Kibana Airlines
    4         Kibana Airlines
                   ...
    13054    Logstash Airways
    13055    Logstash Airways
    13056    Logstash Airways
    13057            JetBeats
    13058            JetBeats
    Name: Carrier, Length: 13059, dtype: object
    >>> df.Carrier.rename('Airline')
    0         Kibana Airlines
    1        Logstash Airways
    2        Logstash Airways
    3         Kibana Airlines
    4         Kibana Airlines
                   ...
    13054    Logstash Airways
    13055    Logstash Airways
    13056    Logstash Airways
    13057            JetBeats
    13058            JetBeats
    Name: Airline, Length: 13059, dtype: object
    """
    # Not in-place: the renamed query compiler backs a brand-new Series.
    renamed_compiler = self._query_compiler.rename({self.name: new_name})
    return Series(_query_compiler=renamed_compiler)
def head(self, n: int = 5) -> "Series":
    """Return a new Series wrapping the query compiler's ``head(n)`` result."""
    head_compiler = self._query_compiler.head(n)
    return Series(_query_compiler=head_compiler)
def tail(self, n: int = 5) -> "Series":
    """Return a new Series wrapping the query compiler's ``tail(n)`` result."""
    tail_compiler = self._query_compiler.tail(n)
    return Series(_query_compiler=tail_compiler)
def sample(
    self,
    n: Optional[int] = None,
    frac: Optional[float] = None,
    random_state: Optional[int] = None,
) -> "Series":
    """
    Return a new Series wrapping the query compiler's ``sample`` result.

    ``n``, ``frac`` and ``random_state`` are passed through positionally,
    in that order.
    """
    sampled_compiler = self._query_compiler.sample(n, frac, random_state)
    return Series(_query_compiler=sampled_compiler)
def value_counts(self, os_size: int = 10) -> pd.Series:
    """
    Return the value counts for the specified field.

    **Note we can only do this for aggregatable OpenSearch fields - (in general) numeric and keyword
    rather than text fields**

    TODO - implement remainder of pandas arguments

    Parameters
    ----------
    os_size: int, default 10
        Number of buckets to return counts for, automatically sorts by count descending.
        This parameter is specific to `opensearch_py_ml`, and determines how many term buckets
        OpenSearch should return out of the overall terms list.

    Returns
    -------
    pandas.Series
        number of occurrences of each value in the column

    Raises
    ------
    TypeError
        If ``os_size`` is not an ``int``.
    ValueError
        If ``os_size`` is zero or negative.

    See Also
    --------
    :pandas_api_docs:`pandas.Series.value_counts`
    :os_api_docs:`search-aggregations-bucket-terms-aggregation`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')
    >>> for key, value in df['Carrier'].value_counts().items():
    ...     print(key, value)
    Logstash Airways 3331
    JetBeats 3274
    Kibana Airlines 3234
    ES-Air 3220
    """
    # Guard clauses: validate the bucket count before touching the query compiler.
    if not isinstance(os_size, int):
        raise TypeError("os_size must be a positive integer.")
    if os_size <= 0:
        raise ValueError("os_size must be a positive integer.")

    counts = self._query_compiler.value_counts(os_size)
    return counts
# dtype not implemented for Series as causes query to fail
# in pandas.core.computation.ops.Term.type
# NOTE(review): a ``dtype`` property is defined further down in this class,
# so the comment above looks stale - confirm before relying on it.

# ----------------------------------------------------------------------
# Rendering Methods
def __repr__(self) -> str:
    """
    Return a string representation for a particular Series.

    The pandas display options ``display.max_rows``, ``display.min_rows`` and
    ``display.show_dimensions`` control how much is rendered.
    """
    # max_rows and max_cols determine the maximum size of the pretty printed tabular
    # representation of the series. pandas defaults are 60 and 20 respectively.
    # series where len(series) > max_rows shows a truncated view with 10 rows shown.
    row_limit = pd.get_option("display.max_rows")
    row_floor = pd.get_option("display.min_rows")

    # Once the series overflows the limit, shrink the limit to min_rows.
    if row_limit and len(self) > row_limit:
        row_limit = row_floor

    rendered = StringIO()
    self.to_string(
        buf=rendered,
        name=True,
        dtype=True,
        min_rows=row_floor,
        max_rows=row_limit,
        length=pd.get_option("display.show_dimensions"),
    )
    return rendered.getvalue()
@docstring_parameter(DEFAULT_NUM_ROWS_DISPLAYED)
def to_string(
    self,
    buf=None,
    na_rep="NaN",
    float_format=None,
    header=True,
    index=True,
    length=False,
    dtype=False,
    name=False,
    max_rows=None,
    min_rows=None,
) -> Optional[str]:
    """
    Render a string representation of the Series.

    Follows pandas implementation except when ``max_rows=None``. In this scenario, we set ``max_rows={0}`` to avoid
    accidentally dumping an entire index. This can be overridden by explicitly setting ``max_rows``.

    Parameters
    ----------
    buf : optional
        Target to write to. When None the rendered text is returned instead.
    length, dtype, name : bool
        Select which entries appear in the summary footer.
    max_rows : int, optional
        Maximum number of rows to render; see above for the default.
    min_rows : int, optional
        Accepted in the signature but not read by this implementation.

    Returns
    -------
    str or None
        The rendered text when ``buf`` is None, otherwise None.

    See Also
    --------
    :pandas_api_docs:`pandas.Series.to_string`
        for argument details.
    """
    # NOTE: the docstring above is passed through ``docstring_parameter``
    # (presumably ``str.format`` - confirm), so keep it free of stray braces.

    # In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this
    # by limiting rows by default.
    num_rows = len(self)  # avoid multiple calls
    if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED:
        if max_rows is None:
            max_rows = num_rows
        else:
            max_rows = min(num_rows, max_rows)
    elif max_rows is None:
        warnings.warn(
            f"Series.to_string called without max_rows set "
            f"- this will return entire index results. "
            f"Setting max_rows={DEFAULT_NUM_ROWS_DISPLAYED}"
            f" overwrite if different behaviour is required.",
            UserWarning,
        )
        max_rows = DEFAULT_NUM_ROWS_DISPLAYED

    # because of the way pandas handles max_rows=0, not having this throws an error
    # see opensearch_py_ml issue #56
    if max_rows == 0:
        max_rows = 1

    # Create a slightly bigger dataframe than display
    temp_series = self._build_repr(max_rows + 1)

    if buf is not None:
        # NOTE(review): if ``buf`` is a path-like this yields a str, and the
        # ``_buf.write`` for the footer below would then fail - confirm
        # whether path targets are meant to be supported.
        _buf = _expand_user(stringify_path(buf))
    else:
        _buf = StringIO()

    if num_rows == 0:
        # Empty series are rendered differently than
        # series with items. We can luckily use our
        # example series in this case.
        temp_series.head(0).to_string(
            buf=_buf,
            na_rep=na_rep,
            float_format=float_format,
            header=header,
            index=index,
            length=length,
            dtype=dtype,
            name=name,
            max_rows=max_rows,
        )
    else:
        # Create repr of fake series without name, length, dtype summary
        temp_series.to_string(
            buf=_buf,
            na_rep=na_rep,
            float_format=float_format,
            header=header,
            index=index,
            length=False,
            dtype=False,
            name=False,
            max_rows=max_rows,
        )

        # Create the summary
        footer = []
        if name and self.name is not None:
            footer.append(f"Name: {self.name}")
        # Reuse the cached row count instead of issuing another len(self) query.
        if length and num_rows > max_rows:
            # Left as len(self.index): presumably equal to num_rows, but
            # that is not established here - TODO confirm before replacing.
            footer.append(f"Length: {len(self.index)}")
        if dtype:
            footer.append(f"dtype: {temp_series.dtype}")

        if footer:
            _buf.write(f"\n{', '.join(footer)}")

    if buf is None:
        return _buf.getvalue()
    # Output went to the caller-supplied buffer.
    return None
def to_pandas(self, show_progress: bool = False) -> pd.Series:
    """
    Return the series as a ``pandas.Series``.

    Asks the query compiler for its pandas result (forwarding
    ``show_progress``) and selects the entry keyed by this series' name.
    """
    pandas_result = self._query_compiler.to_pandas(show_progress=show_progress)
    return pandas_result[self.name]
@property
def dtype(self) -> np.dtype:
    """
    Return the dtype object of the underlying data.

    See Also
    --------
    :pandas_api_docs:`pandas.Series.dtype`
    """
    return self._query_compiler.dtypes[0]


@property
def os_dtype(self) -> str:
    """
    Return the OpenSearch type of the underlying data.
    """
    return self._query_compiler.os_dtypes[0]


def _comparison_filter(
    self,
    other: Any,
    painless_operator: str,
    scalar_filter: Any,
    accept_str: bool = False,
) -> BooleanFilter:
    """
    Shared implementation behind the rich-comparison operators.

    Parameters
    ----------
    other
        Right-hand operand of the comparison.
    painless_operator: str
        Operator text placed between the two ``doc[...]`` values when
        ``other`` is a Series.
    scalar_filter
        Callable taking the scalar operand and returning the filter to use
        when ``other`` is a plain value.
    accept_str: bool, default False
        Whether ``str`` operands are accepted as scalars (used by the
        equality operators only).

    Raises
    ------
    NotImplementedError
        If ``other`` is none of the supported operand types.
    """
    if isinstance(other, np.datetime64):
        # convert numpy datetime64 object it has no `strftime` method
        other = pd.to_datetime(other)

    if isinstance(other, Series):
        # Need to use scripted query to compare to values
        painless = (
            f"doc['{self.name}'].value {painless_operator} doc['{other.name}'].value"
        )
        return ScriptFilter(painless, lang="painless")

    scalar_types = (int, float, datetime)
    if accept_str:
        scalar_types += (str,)
    if isinstance(other, scalar_types):
        return scalar_filter(other)

    raise NotImplementedError(other, type(other))


def __gt__(
    self, other: Union[int, float, datetime, np.datetime64, "Series"]
) -> BooleanFilter:
    """Build the filter for ``self > other``."""
    return self._comparison_filter(
        other, ">", lambda value: Greater(field=self.name, value=value)
    )


def __lt__(
    self, other: Union[int, float, datetime, np.datetime64, "Series"]
) -> BooleanFilter:
    """Build the filter for ``self < other``."""
    return self._comparison_filter(
        other, "<", lambda value: Less(field=self.name, value=value)
    )


def __ge__(
    self, other: Union[int, float, datetime, np.datetime64, "Series"]
) -> BooleanFilter:
    """Build the filter for ``self >= other``."""
    return self._comparison_filter(
        other, ">=", lambda value: GreaterEqual(field=self.name, value=value)
    )


def __le__(
    self, other: Union[int, float, datetime, np.datetime64, "Series"]
) -> BooleanFilter:
    """Build the filter for ``self <= other``."""
    return self._comparison_filter(
        other, "<=", lambda value: LessEqual(field=self.name, value=value)
    )


def __eq__(
    self, other: Union[int, float, str, datetime, np.datetime64, "Series"]
) -> BooleanFilter:
    """Build the filter for ``self == other``; ``str`` operands are accepted."""
    return self._comparison_filter(
        other,
        "==",
        lambda value: Equal(field=self.name, value=value),
        accept_str=True,
    )


def __ne__(
    self, other: Union[int, float, str, datetime, np.datetime64, "Series"]
) -> BooleanFilter:
    """Build the filter for ``self != other``; ``str`` operands are accepted."""
    return self._comparison_filter(
        other,
        "!=",
        # Scalar inequality is expressed as the negation of an Equal filter.
        lambda value: NotFilter(Equal(field=self.name, value=value)),
        accept_str=True,
    )
def isin(self, other: Union[Collection, pd.Series]) -> BooleanFilter:
    """
    Build an ``IsIn`` filter on this series' name.

    ``other`` is converted with ``to_list`` before being handed to the
    filter. Anything that is neither a ``Collection`` nor a
    ``pandas.Series`` raises ``NotImplementedError``.
    """
    if not isinstance(other, (Collection, pd.Series)):
        raise NotImplementedError(other, type(other))

    candidates = to_list(other)
    return IsIn(field=self.name, value=candidates)
def isna(self) -> BooleanFilter:
    """
    Detect missing values.

    Returns
    -------
    BooleanFilter
        An ``IsNull`` filter on this series' name, i.e. a mask selecting
        the elements of the Series that are NA values.

    See Also
    --------
    :pandas_api_docs:`pandas.Series.isna`
    """
    return IsNull(field=self.name)


# Alias: ``isnull`` is the same callable as ``isna``.
isnull = isna
def notna(self) -> BooleanFilter:
    """
    Detect existing (non-missing) values.

    Returns
    -------
    BooleanFilter
        A ``NotNull`` filter on this series' name, i.e. a mask selecting
        the elements of the Series that are not NA values.

    See Also
    --------
    :pandas_api_docs:`pandas.Series.notna`
    """
    present_filter = NotNull(field=self.name)
    return present_filter


# Alias: ``notnull`` is the same callable as ``notna``.
notnull = notna
def quantile(
    self, q: Union[int, float, List[int], List[float]] = 0.5
) -> Union[pd.Series, Any]:
    """
    Used to calculate quantile for a given Series.

    Parameters
    ----------
    q:
        float or array like, default 0.5
        Value between 0 <= q <= 1, the quantile(s) to compute.

    Returns
    -------
    pandas.Series or any single dtype

    See Also
    --------
    :pandas_api_docs:`pandas.Series.quantile`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> oml_flights = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')
    >>> oml_flights["timestamp"].quantile([.2,.5,.75]) # doctest: +SKIP
    0.20   2018-01-09 04:30:57.289159912
    0.50   2018-01-21 23:39:27.031627441
    0.75   2018-02-01 04:54:59.256136963
    Name: timestamp, dtype: datetime64[ns]

    >>> oml_flights["dayOfWeek"].quantile() # doctest: +SKIP
    3.0

    >>> oml_flights["timestamp"].quantile() # doctest: +SKIP
    Timestamp('2018-01-22 00:12:48.844534180')
    """
    compiler = self._query_compiler
    # is_dataframe=False marks the call as coming from a Series.
    return compiler.quantile(quantiles=q, numeric_only=None, is_dataframe=False)
@property
def ndim(self) -> int:
    """
    Returns 1 by definition of a Series

    Returns
    -------
    int
        By definition 1

    See Also
    --------
    :pandas_api_docs:`pandas.Series.ndim`
    """
    series_dimensions = 1
    return series_dimensions
def filter(
    self,
    items: Optional[Sequence[str]] = None,
    like: Optional[str] = None,
    regex: Optional[str] = None,
    axis: Optional[Union[int, str]] = None,
) -> "Series":
    """
    Subset the dataframe rows or columns according to the specified index labels.
    Note that this routine does not filter a dataframe on its contents.
    The filter is applied to the labels of the index.

    Parameters
    ----------
    items : list-like
        Keep labels from axis which are in items.
    like : str
        Keep labels from axis for which "like in label == True".
    regex : str (regular expression)
        Keep labels from axis for which re.search(regex, label) == True.
    axis : {0 or ‘index’, 1 or ‘columns’, None}, default None
        The axis to filter on, expressed either as an index (int) or axis name (str).
        By default this is the info axis, ‘index’ for Series, ‘columns’ for DataFrame.

    Returns
    -------
    opensearch_py_ml.Series

    Raises
    ------
    TypeError
        If none, or more than one, of ``items``, ``like`` and ``regex`` is passed.

    See Also
    --------
    :pandas_api_docs:`pandas.Series.filter`

    Notes
    -----
    The ``items``, ``like``, and ``regex`` parameters are
    enforced to be mutually exclusive.
    """
    # ``items`` counts when it is not None; ``like``/``regex`` count when truthy.
    selectors_given = (items is not None) + bool(like) + bool(regex)
    if selectors_given == 0:
        raise TypeError("Must pass either 'items', 'like', or 'regex'")
    if selectors_given > 1:
        raise TypeError(
            "Keyword arguments `items`, `like`, or `regex` " "are mutually exclusive"
        )

    # axis defaults to 'columns' for DataFrame, 'index' for Series.
    # The lookup result is discarded; the call is made only to validate ``axis``.
    pd.Series._get_axis_name("index" if axis is None else axis)

    filtered_compiler = self._query_compiler.filter(
        items=items, like=like, regex=regex
    )
    return Series(_query_compiler=filtered_compiler)
def mode(self, os_size: int = 10) -> pd.Series:
    """
    Calculate mode of a series

    Parameters
    ----------
    os_size: default 10
        number of rows to be returned if mode has multiple values

    See Also
    --------
    :pandas_api_docs:`pandas.Series.mode`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> oml_ecommerce = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce')
    >>> oml_ecommerce["day_of_week"].mode()
    0    Thursday
    Name: day_of_week, dtype: object

    >>> oml_ecommerce["order_date"].mode()
    0   2016-12-02 20:36:58
    1   2016-12-04 23:44:10
    2   2016-12-08 06:21:36
    3   2016-12-08 09:38:53
    4   2016-12-12 11:38:24
    5   2016-12-12 19:46:34
    6   2016-12-14 18:00:00
    7   2016-12-15 11:38:24
    8   2016-12-22 19:39:22
    9   2016-12-24 06:21:36
    Name: order_date, dtype: datetime64[ns]

    >>> oml_ecommerce["order_date"].mode(os_size=3)
    0   2016-12-02 20:36:58
    1   2016-12-04 23:44:10
    2   2016-12-08 06:21:36
    Name: order_date, dtype: datetime64[ns]
    """
    compiler = self._query_compiler
    # is_dataframe=False marks the call as coming from a Series.
    return compiler.mode(is_dataframe=False, os_size=os_size)
def os_match(
    self,
    text: str,
    *,
    match_phrase: bool = False,
    match_only_text_fields: bool = True,
    analyzer: Optional[str] = None,
    fuzziness: Optional[Union[int, str]] = None,
    **kwargs: Any,
) -> QueryFilter:
    """Filters data with an OpenSearch ``match`` or ``match_phrase``
    query depending on the given parameters.

    Read more about `Full-Text Queries in OpenSearch <https://opensearch.org/docs/latest/opensearch/query-dsl/full-text/>`_

    All additional keyword arguments are passed in the body of the match query.

    Parameters
    ----------
    text: str
        String of text to search for
    match_phrase: bool, default False
        If True will use ``match_phrase`` instead of ``match`` query which takes into account
        the order of the ``text`` parameter.
    match_only_text_fields: bool, default True
        When True this function will raise an error if any non-text fields
        are queried to prevent fields that aren't analyzed from not working properly.
        Set to False to ignore this preventative check.
    analyzer: str, optional
        Specify which analyzer to use for the match query
    fuzziness: int, str, optional
        Specify the fuzziness option for the match query

    Returns
    -------
    QueryFilter
        Boolean filter to be combined with other filters and
        then passed to DataFrame[...].

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(
    ...     OPENSEARCH_TEST_CLIENT, "ecommerce",
    ...     columns=["category", "taxful_total_price"]
    ... )
    >>> df[
    ...     df.category.os_match("Men's")
    ...     & (df.taxful_total_price > 200.0)
    ... ].head(5)
                                   category  taxful_total_price
    13                     [Men's Clothing]              266.96
    33                     [Men's Clothing]              221.98
    54                     [Men's Clothing]              234.98
    93   [Men's Shoes, Women's Accessories]              239.98
    273                       [Men's Shoes]              214.98
    <BLANKLINE>
    [5 rows x 2 columns]
    """
    # The query is restricted to this series' single column.
    target_columns = [self.name]
    return self._query_compiler.os_match(
        text,
        columns=target_columns,
        match_phrase=match_phrase,
        match_only_text_fields=match_only_text_fields,
        analyzer=analyzer,
        fuzziness=fuzziness,
        **kwargs,
    )
def os_info(self) -> str:
    """
    Return the text that the parent class's ``_os_info`` writes.

    The parent implementation is given an in-memory buffer, and the
    buffer's contents are returned as a string.
    """
    info_buffer = StringIO()
    super()._os_info(info_buffer)
    info_text = info_buffer.getvalue()
    return info_text
def __add__(self, right: "Series") -> "Series":
    """
    Return addition of series and right, element-wise (binary operator add).

    Parameters
    ----------
    right: opensearch_py_ml.Series, number or str
        Right-hand operand.

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.taxful_total_price + 1
    0     37.980000
    1     54.980000
    2    200.979996
    3    175.979996
    4     81.980003
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price + df.total_quantity
    0     38.980000
    1     55.980000
    2    201.979996
    3    176.979996
    4     82.980003
    dtype: float64
    >>> df.customer_first_name + df.customer_last_name
    0    EddieUnderwood
    1        MaryBailey
    2        GwenButler
    3     DianeChandler
    4        EddieWeber
    dtype: object
    >>> "First name: " + df.customer_first_name
    0    First name: Eddie
    1     First name: Mary
    2     First name: Gwen
    3    First name: Diane
    4    First name: Eddie
    Name: customer_first_name, dtype: object
    """
    return self._numeric_op(right, _get_method_name())


def __truediv__(self, right: "Series") -> "Series":
    """
    Return floating division of series and right, element-wise (binary operator truediv).

    Parameters
    ----------
    right: opensearch_py_ml.Series, number or str
        Right-hand operand (the divisor).

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price / df.total_quantity
    0    18.490000
    1    26.990000
    2    99.989998
    3    87.489998
    4    40.490002
    dtype: float64
    """
    return self._numeric_op(right, _get_method_name())


def __floordiv__(self, right: "Series") -> "Series":
    """
    Return integer division of series and right, element-wise (binary operator floordiv //).

    Parameters
    ----------
    right: opensearch_py_ml.Series, number or str
        Right-hand operand (the divisor).

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price // df.total_quantity
    0    18.0
    1    26.0
    2    99.0
    3    87.0
    4    40.0
    dtype: float64
    """
    return self._numeric_op(right, _get_method_name())


def __mod__(self, right: "Series") -> "Series":
    """
    Return modulo of series and right, element-wise (binary operator mod %).

    Parameters
    ----------
    right: opensearch_py_ml.Series, number or str
        Right-hand operand (the modulus).

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price % df.total_quantity
    0    0.980000
    1    1.980000
    2    1.979996
    3    0.979996
    4    0.980003
    dtype: float64
    """
    return self._numeric_op(right, _get_method_name())


def __mul__(self, right: "Series") -> "Series":
    """
    Return multiplication of series and right, element-wise (binary operator mul).

    Parameters
    ----------
    right: opensearch_py_ml.Series, number or str
        Right-hand operand.

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price * df.total_quantity
    0     73.959999
    1    107.959999
    2    399.959991
    3    349.959991
    4    161.960007
    dtype: float64
    """
    return self._numeric_op(right, _get_method_name())


def __sub__(self, right: "Series") -> "Series":
    """
    Return subtraction of series and right, element-wise (binary operator sub).

    Parameters
    ----------
    right: opensearch_py_ml.Series, number or str
        Right-hand operand (the value subtracted).

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price - df.total_quantity
    0     34.980000
    1     51.980000
    2    197.979996
    3    172.979996
    4     78.980003
    dtype: float64
    """
    return self._numeric_op(right, _get_method_name())


def __pow__(self, right: "Series") -> "Series":
    """
    Return exponential power of series and right, element-wise (binary operator pow).

    Parameters
    ----------
    right: opensearch_py_ml.Series, number or str
        Right-hand operand (the exponent).

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price ** df.total_quantity
    0     1367.520366
    1     2913.840351
    2    39991.998691
    3    30617.998905
    4     6557.760944
    dtype: float64
    """
    return self._numeric_op(right, _get_method_name())


def __radd__(self, left: "Series") -> "Series":
    """
    Return addition of left and series, element-wise (reflected binary operator add).

    Parameters
    ----------
    left: opensearch_py_ml.Series, number or str
        Left-hand operand.

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> 1 + df.taxful_total_price
    0     37.980000
    1     54.980000
    2    200.979996
    3    175.979996
    4     81.980003
    Name: taxful_total_price, dtype: float64
    """
    return self._numeric_op(left, _get_method_name())


def __rtruediv__(self, left: "Series") -> "Series":
    """
    Return floating division of left by series, element-wise (reflected binary operator truediv).

    Parameters
    ----------
    left: opensearch_py_ml.Series, number or str
        Left-hand operand (the dividend).

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> 1.0 / df.taxful_total_price
    0    0.027042
    1    0.018525
    2    0.005001
    3    0.005715
    4    0.012349
    Name: taxful_total_price, dtype: float64
    """
    return self._numeric_op(left, _get_method_name())


def __rfloordiv__(self, left: "Series") -> "Series":
    """
    Return integer division of left by series, element-wise (reflected binary operator floordiv //).

    Parameters
    ----------
    left: opensearch_py_ml.Series, number or str
        Left-hand operand (the dividend).

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> 500.0 // df.taxful_total_price
    0    13.0
    1     9.0
    2     2.0
    3     2.0
    4     6.0
    Name: taxful_total_price, dtype: float64
    """
    return self._numeric_op(left, _get_method_name())


def __rmod__(self, left: "Series") -> "Series":
    """
    Return modulo of left by series, element-wise (reflected binary operator mod %).

    Parameters
    ----------
    left: opensearch_py_ml.Series, number or str
        Left-hand operand (the dividend).

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> 500.0 % df.taxful_total_price
    0     19.260006
    1     14.180004
    2    100.040009
    3    150.040009
    4     14.119980
    Name: taxful_total_price, dtype: float64
    """
    return self._numeric_op(left, _get_method_name())


def __rmul__(self, left: "Series") -> "Series":
    """
    Return multiplication of left and series, element-wise (reflected binary operator mul).

    Parameters
    ----------
    left: opensearch_py_ml.Series, number or str
        Left-hand operand.

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> 10.0 * df.taxful_total_price
    0     369.799995
    1     539.799995
    2    1999.799957
    3    1749.799957
    4     809.800034
    Name: taxful_total_price, dtype: float64
    """
    return self._numeric_op(left, _get_method_name())


def __rpow__(self, left: "Series") -> "Series":
    """
    Return left raised to the power of series, element-wise (reflected binary operator pow).

    Parameters
    ----------
    left: opensearch_py_ml.Series, number or str
        Left-hand operand (the base).

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> np.int_(2) ** df.total_quantity
    0    4.0
    1    4.0
    2    4.0
    3    4.0
    4    4.0
    Name: total_quantity, dtype: float64
    """
    return self._numeric_op(left, _get_method_name())


def __rsub__(self, left: "Series") -> "Series":
    """
    Return subtraction of series from left, element-wise (reflected binary operator sub).

    Parameters
    ----------
    left: opensearch_py_ml.Series, number or str
        Left-hand operand (the value subtracted from).

    Returns
    -------
    opensearch_py_ml.Series

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> 1.0 - df.taxful_total_price
    0    -35.980000
    1    -52.980000
    2   -198.979996
    3   -173.979996
    4    -79.980003
    Name: taxful_total_price, dtype: float64
    """
    return self._numeric_op(left, _get_method_name())


# Named aliases for the forward operators.
add = __add__
div = __truediv__
divide = __truediv__
floordiv = __floordiv__
mod = __mod__
mul = __mul__
multiply = __mul__
pow = __pow__
sub = __sub__
subtract = __sub__
truediv = __truediv__

# Named aliases for the reflected operators.
radd = __radd__
rdiv = __rtruediv__
rdivide = __rtruediv__
rfloordiv = __rfloordiv__
rmod = __rmod__
rmul = __rmul__
rmultiply = __rmul__
rpow = __rpow__
rsub = __rsub__
rsubtract = __rsub__
rtruediv = __rtruediv__

# __div__ is technically Python 2.x only
# but pandas has it so we do too.
__div__ = __truediv__
__rdiv__ = __rtruediv__


def _numeric_op(self, right: Any, method_name: str) -> "Series":
    """
    Apply the arithmetic operation ``method_name`` to this Series and ``right``.

    Supported operands
    ------------------
    right == Series
        Both Series must pass ``check_arithmetics`` on their query
        compilers (per the original notes: same OpenSearch client,
        index_pattern and index_field).
    right == numeric or string
        The scalar is wrapped and applied to every element.

    Naming of the resulting Series
    ------------------------------
    result = SeriesA op SeriesB
    result.name == None

    result = SeriesA op np.number
    result.name == SeriesA.name

    result = SeriesA op str
    result.name == SeriesA.name

    Naming is consistent for rops

    Parameters
    ----------
    right: Any
        The other operand: a Series, a number or a str.
    method_name: str
        Name of the operator method being applied (e.g. ``__add__``).

    Returns
    -------
    opensearch_py_ml.Series
        New Series representing the result of the operation.

    Raises
    ------
    TypeError
        If ``right`` is not a Series, a number or a str.
    """
    if isinstance(right, Series):
        # Check that the 2 Series are compatible (raises on error):
        self._query_compiler.check_arithmetics(right._query_compiler)
        other_operand = ArithmeticSeries(
            right._query_compiler, right.name, right.dtype
        )
        # Series-with-Series results are unnamed.
        result_name = None
    elif np.issubdtype(np.dtype(type(right)), np.number):
        number_dtype = np.dtype(type(right))
        other_operand = ArithmeticNumber(right, number_dtype)
        result_name = self.name
    elif isinstance(right, str):
        other_operand = ArithmeticString(right)
        result_name = self.name
    else:
        message = (
            f"unsupported operation type(s) [{method_name!r}] "
            f"for operands ['{type(self)}' with dtype '{self.dtype}', "
            f"'{type(right).__name__}']"
        )
        raise TypeError(message)

    this_operand = ArithmeticSeries(self._query_compiler, self.name, self.dtype)
    this_operand.arithmetic_operation(method_name, other_operand)

    result_compiler = self._query_compiler.arithmetic_op_fields(
        result_name, this_operand
    )
    result = Series(_query_compiler=result_compiler)

    # force set name to 'result_name'
    result._query_compiler._mappings.display_names = [result_name]

    return result
def max(self, numeric_only: Optional[bool] = None) -> pd.Series:
    """
    Return the maximum of the Series values

    The value is computed by the parent class's ``max`` and then squeezed.

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        max value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.max`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> s = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice']
    >>> int(s.max())
    1199
    """
    return super().max(numeric_only=numeric_only).squeeze()
def mean(self, numeric_only: Optional[bool] = None) -> pd.Series:
    """
    Return the mean of the Series values

    The value is computed by the parent class's ``mean`` and then squeezed.

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        mean value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.mean`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> s = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice']
    >>> int(s.mean())
    628
    """
    return super().mean(numeric_only=numeric_only).squeeze()
def median(self, numeric_only: Optional[bool] = None) -> pd.Series:
    """
    Return the median of the Series values

    The value is computed by the parent class's ``median`` and then squeezed.

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        median value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.median`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> s = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice']
    >>> int(s.median())
    640
    """
    return super().median(numeric_only=numeric_only).squeeze()
def min(self, numeric_only: Optional[bool] = None) -> pd.Series:
    """
    Return the minimum of the Series values

    The value is computed by the parent class's ``min`` and then squeezed.

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        min value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.min`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> s = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice']
    >>> int(s.min())
    100
    """
    return super().min(numeric_only=numeric_only).squeeze()
def sum(self, numeric_only: Optional[bool] = None) -> pd.Series:
    """
    Return the sum of the Series values

    The value is computed by the parent class's ``sum`` and then squeezed.

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        sum of all values

    See Also
    --------
    :pandas_api_docs:`pandas.Series.sum`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> s = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice']
    >>> int(s.sum())
    8204364
    """
    return super().sum(numeric_only=numeric_only).squeeze()
def nunique(self) -> pd.Series:
    """
    Return the number of unique values in a Series

    The count is computed by the parent class's ``nunique`` and then squeezed.

    Returns
    -------
    int
        Number of unique values

    See Also
    --------
    :pandas_api_docs:`pandas.Series.nunique`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> s = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['Carrier']
    >>> s.nunique()
    4
    """
    return super().nunique().squeeze()
def unique(self) -> pd.Series:
    """
    Returns all unique values within a Series.

    The work is delegated to the query compiler's ``unique``.

    Note that behavior is slightly different between pandas and
    opensearch_py_ml: pandas will return values in the order they're first
    seen and opensearch-py-ml returns values in sorted order.

    Returns
    -------
    pd.Series
        A series containing unique values of given series is returned.

    See Also
    --------
    :pandas_api_docs:`pandas.Series.unique`
    """
    compiler = self._query_compiler
    return compiler.unique()
def var(self, numeric_only: Optional[bool] = None) -> pd.Series:
    """
    Return variance for a Series

    The value is computed by the parent class's ``var`` and then squeezed.

    Returns
    -------
    float
        var value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.var`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> s = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice']
    >>> int(s.var())
    70964
    """
    return super().var(numeric_only=numeric_only).squeeze()
def std(self, numeric_only: Optional[bool] = None) -> pd.Series:
    """
    Return standard deviation for a Series

    The value is computed by the parent class's ``std`` and then squeezed.

    Returns
    -------
    float
        std value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.std`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> s = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice']
    >>> int(s.std())
    266
    """
    return super().std(numeric_only=numeric_only).squeeze()
def mad(self, numeric_only: Optional[bool] = None) -> pd.Series:
    """
    Return median absolute deviation for a Series

    The value is computed by the parent class's ``mad`` and then squeezed.

    Returns
    -------
    float
        mad value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.mad`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> s = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice']
    >>> int(s.mad())
    213
    """
    return super().mad(numeric_only=numeric_only).squeeze()
def describe(self) -> pd.Series:
    """
    Generate descriptive statistics that summarize the central tendency,
    dispersion and shape of a dataset’s distribution, excluding NaN values.

    Analyzes both numeric and object series, as well as DataFrame column
    sets of mixed data types. The output will vary depending on what is
    provided. Refer to the notes below for more detail.

    The summary is computed by the parent class's ``describe`` and then
    squeezed.

    TODO - add additional arguments (current only numeric values supported)

    Returns
    -------
    pandas.Series:
        Summary information

    See Also
    --------
    :pandas_api_docs:`pandas.Series.describe`

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') # ignoring percentiles as they don't generate consistent results
    >>> df.AvgTicketPrice.describe()  # doctest: +SKIP
    count    13059.000000
    mean       628.253689
    std        266.386661
    min        100.020531
    ...
    ...
    ...
    max       1199.729004
    Name: AvgTicketPrice, dtype: float64
    """
    summary = super().describe()
    return summary.squeeze()
# def values: TODO - not implemented as it causes the current implementation of query to fail
def to_numpy(self) -> None:
    """
    Not implemented.

    In pandas this returns a Numpy representation of the Series. This would
    involve scan/scrolling the entire index.

    If this is required, call ``oml.opensearch_to_pandas(oml_series).values``,
    *but beware this will scan/scroll the entire OpenSearch index(s) into memory.*

    Raises
    ------
    NotImplementedError
        Always; this method never returns.

    See Also
    --------
    :pandas_api_docs:`pandas.Series.to_numpy`
    opensearch_to_pandas

    Examples
    --------
    >>> from tests import OPENSEARCH_TEST_CLIENT
    >>> oml_s = oml.Series(OPENSEARCH_TEST_CLIENT, 'flights', name='Carrier').head(5)
    >>> pd_s = oml.opensearch_to_pandas(oml_s)
    >>> print(f"type(oml_s)={type(oml_s)}\\ntype(pd_s)={type(pd_s)}")
    type(oml_s)=<class 'opensearch_py_ml.series.Series'>
    type(pd_s)=<class 'pandas.core.series.Series'>
    >>> oml_s
    0     Kibana Airlines
    1    Logstash Airways
    2    Logstash Airways
    3     Kibana Airlines
    4     Kibana Airlines
    Name: Carrier, dtype: object
    >>> pd_s.to_numpy()
    array(['Kibana Airlines', 'Logstash Airways', 'Logstash Airways',
           'Kibana Airlines', 'Kibana Airlines'], dtype=object)
    """
    # The trailing space on the first literal matters: adjacent string
    # literals are concatenated verbatim, so without it the two sentences
    # run together ("memory.If this ...").
    raise NotImplementedError(
        "This method would scan/scroll the entire OpenSearch index(s) into memory. "
        "If this is explicitly required and there is sufficient memory, "
        "call `oml.opensearch_to_pandas(oml_series).values`"
    )