Demo Notebook for Dataframe
Download notebook
Step 0: Imports
[2]:
%load_ext autoreload
%autoreload 2
[3]:
# import this to stop opensearch-py-ml from yelling every time a DataFrame connection made
import warnings
warnings.filterwarnings('ignore')
[4]:
# imports to demonstrate DataFrame support
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import opensearch_py_ml as oml
from opensearchpy import OpenSearch
# Import standard test settings for consistent results
from opensearch_py_ml.conftest import *
Step 1: Setup clients
[5]:
CLUSTER_URL = 'https://localhost:9200'
def get_os_client(cluster_url = CLUSTER_URL,
                  username='admin',
                  password='< admin password >'):
    '''
    Get OpenSearch client
    :param cluster_url: cluster URL like https://ml-te-netwo-1s12ba42br23v-ff1736fa7db98ff2.elb.us-west-2.amazonaws.com:443
    :return: OpenSearch client
    '''
    client = OpenSearch(
        hosts=[cluster_url],
        http_auth=(username, password),
        verify_certs=False
    )
    return client
[6]:
client = get_os_client()
Comparing similarities and differences between panda dataframe vs opensearch_py_ml dataframe
Create an opensearch_py_ml.DataFrame from a flights index
[7]:
oml_flights = oml.DataFrame(client, 'flights')
[8]:
type(oml_flights)
[8]:
opensearch_py_ml.dataframe.DataFrame
Compare to pandas DataFrame (created from the same data)
[9]:
pd_flights = oml.opensearch_to_pandas(oml_flights)
[10]:
type(pd_flights)
[10]:
pandas.core.frame.DataFrame
Attributes and underlying data
DataFrame.columns
[11]:
pd_flights.columns
[11]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
       'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
       'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
       'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
       'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
       'timestamp'],
      dtype='object')
[12]:
oml_flights.columns
[12]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
       'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
       'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
       'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
       'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
       'timestamp'],
      dtype='object')
DataFrame.dtypes
[13]:
pd_flights.dtypes
[13]:
AvgTicketPrice           float64
Cancelled                   bool
Carrier                   object
Dest                      object
DestAirportID             object
                       ...
OriginLocation            object
OriginRegion              object
OriginWeather             object
dayOfWeek                  int64
timestamp         datetime64[ns]
Length: 27, dtype: object
[14]:
oml_flights.dtypes
[14]:
AvgTicketPrice           float64
Cancelled                   bool
Carrier                   object
Dest                      object
DestAirportID             object
                       ...
OriginLocation            object
OriginRegion              object
OriginWeather             object
dayOfWeek                  int64
timestamp         datetime64[ns]
Length: 27, dtype: object
DataFrame.select_dtypes
[15]:
pd_flights.select_dtypes(include=np.number)
[15]:
| AvgTicketPrice | DistanceKilometers | ... | FlightTimeMin | dayOfWeek | |
|---|---|---|---|---|---|
| 0 | 841.265642 | 16492.326654 | ... | 1030.770416 | 0 | 
| 1 | 882.982662 | 8823.400140 | ... | 464.389481 | 0 | 
| 2 | 190.636904 | 0.000000 | ... | 0.000000 | 0 | 
| 3 | 181.694216 | 555.737767 | ... | 222.749059 | 0 | 
| 4 | 730.041778 | 13358.244200 | ... | 785.779071 | 0 | 
| ... | ... | ... | ... | ... | ... | 
| 13054 | 1080.446279 | 8058.581753 | ... | 402.929088 | 6 | 
| 13055 | 646.612941 | 7088.598322 | ... | 644.418029 | 6 | 
| 13056 | 997.751876 | 10920.652972 | ... | 937.540811 | 6 | 
| 13057 | 1102.814465 | 18748.859647 | ... | 1697.404971 | 6 | 
| 13058 | 858.144337 | 16809.141923 | ... | 1610.761827 | 6 | 
13059 rows × 7 columns
[16]:
oml_flights.select_dtypes(include=np.number)
[16]:
| AvgTicketPrice | DistanceKilometers | ... | FlightTimeMin | dayOfWeek | |
|---|---|---|---|---|---|
| 0 | 841.265642 | 16492.326654 | ... | 1030.770416 | 0 | 
| 1 | 882.982662 | 8823.400140 | ... | 464.389481 | 0 | 
| 2 | 190.636904 | 0.000000 | ... | 0.000000 | 0 | 
| 3 | 181.694216 | 555.737767 | ... | 222.749059 | 0 | 
| 4 | 730.041778 | 13358.244200 | ... | 785.779071 | 0 | 
| ... | ... | ... | ... | ... | ... | 
| 13054 | 1080.446279 | 8058.581753 | ... | 402.929088 | 6 | 
| 13055 | 646.612941 | 7088.598322 | ... | 644.418029 | 6 | 
| 13056 | 997.751876 | 10920.652972 | ... | 937.540811 | 6 | 
| 13057 | 1102.814465 | 18748.859647 | ... | 1697.404971 | 6 | 
| 13058 | 858.144337 | 16809.141923 | ... | 1610.761827 | 6 | 
13059 rows × 7 columns
DataFrame.empty
[17]:
pd_flights.empty
[17]:
False
[18]:
oml_flights.empty
[18]:
False
DataFrame.shape
[19]:
pd_flights.shape
[19]:
(13059, 27)
[20]:
oml_flights.shape
[20]:
(13059, 27)
DataFrame.index
Note, opensearch_py_ml.DataFrame.index does not mirror pandas.DataFrame.index.
[21]:
pd_flights.index
[21]:
Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '13049', '13050', '13051', '13052', '13053', '13054', '13055', '13056', '13057', '13058'],
      dtype='object', length=13059)
[22]:
# NBVAL_IGNORE_OUTPUT
oml_flights.index
[22]:
<opensearch_py_ml.index.Index at 0x1131744c0>
[24]:
oml_flights.index.os_index_field
[24]:
'_id'
DataFrame.values
Note, opensearch_py_ml.DataFrame.values is not supported.
[25]:
pd_flights.values
[25]:
array([[841.2656419677076, False, 'Kibana Airlines', ..., 'Sunny', 0,
        Timestamp('2018-01-01 00:00:00')],
       [882.9826615595518, False, 'Logstash Airways', ..., 'Clear', 0,
        Timestamp('2018-01-01 18:27:00')],
       [190.6369038508356, False, 'Logstash Airways', ..., 'Rain', 0,
        Timestamp('2018-01-01 17:11:14')],
       ...,
       [997.7518761454494, False, 'Logstash Airways', ..., 'Sunny', 6,
        Timestamp('2018-02-11 04:09:27')],
       [1102.8144645388556, False, 'JetBeats', ..., 'Hail', 6,
        Timestamp('2018-02-11 08:28:21')],
       [858.1443369038839, False, 'JetBeats', ..., 'Rain', 6,
        Timestamp('2018-02-11 14:54:34')]], dtype=object)
[26]:
try:
    oml_flights.values
except AttributeError as e:
    print(e)
This method would scan/scroll the entire OpenSearch index(s) into memory. If this is explicitly required, and there is sufficient memory, call `ed.opensearch_to_pandas(ed_df).values`
Indexing, iteration
DataFrame.head
[27]:
pd_flights.head()
[27]:
| AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
|---|---|---|---|---|---|
| 0 | 841.265642 | False | ... | 0 | 2018-01-01 00:00:00 | 
| 1 | 882.982662 | False | ... | 0 | 2018-01-01 18:27:00 | 
| 2 | 190.636904 | False | ... | 0 | 2018-01-01 17:11:14 | 
| 3 | 181.694216 | True | ... | 0 | 2018-01-01 10:33:28 | 
| 4 | 730.041778 | False | ... | 0 | 2018-01-01 05:13:00 | 
5 rows × 27 columns
[28]:
oml_flights.head()
[28]:
| AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
|---|---|---|---|---|---|
| 0 | 841.265642 | False | ... | 0 | 2018-01-01 00:00:00 | 
| 1 | 882.982662 | False | ... | 0 | 2018-01-01 18:27:00 | 
| 2 | 190.636904 | False | ... | 0 | 2018-01-01 17:11:14 | 
| 3 | 181.694216 | True | ... | 0 | 2018-01-01 10:33:28 | 
| 4 | 730.041778 | False | ... | 0 | 2018-01-01 05:13:00 | 
5 rows × 27 columns
DataFrame.tail
[29]:
pd_flights.tail()
[29]:
| AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
|---|---|---|---|---|---|
| 13054 | 1080.446279 | False | ... | 6 | 2018-02-11 20:42:25 | 
| 13055 | 646.612941 | False | ... | 6 | 2018-02-11 01:41:57 | 
| 13056 | 997.751876 | False | ... | 6 | 2018-02-11 04:09:27 | 
| 13057 | 1102.814465 | False | ... | 6 | 2018-02-11 08:28:21 | 
| 13058 | 858.144337 | False | ... | 6 | 2018-02-11 14:54:34 | 
5 rows × 27 columns
[30]:
oml_flights.tail()
[30]:
| AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
|---|---|---|---|---|---|
| 13054 | 1080.446279 | False | ... | 6 | 2018-02-11 20:42:25 | 
| 13055 | 646.612941 | False | ... | 6 | 2018-02-11 01:41:57 | 
| 13056 | 997.751876 | False | ... | 6 | 2018-02-11 04:09:27 | 
| 13057 | 1102.814465 | False | ... | 6 | 2018-02-11 08:28:21 | 
| 13058 | 858.144337 | False | ... | 6 | 2018-02-11 14:54:34 | 
5 rows × 27 columns
DataFrame.keys
[31]:
pd_flights.keys()
[31]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
       'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
       'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
       'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
       'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
       'timestamp'],
      dtype='object')
[32]:
oml_flights.keys()
[32]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
       'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
       'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
       'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
       'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
       'timestamp'],
      dtype='object')
DataFrame.get
[33]:
pd_flights.get('Carrier')
[33]:
0         Kibana Airlines
1        Logstash Airways
2        Logstash Airways
3         Kibana Airlines
4         Kibana Airlines
               ...
13054    Logstash Airways
13055    Logstash Airways
13056    Logstash Airways
13057            JetBeats
13058            JetBeats
Name: Carrier, Length: 13059, dtype: object
[34]:
oml_flights.get('Carrier')
[34]:
0         Kibana Airlines
1        Logstash Airways
2        Logstash Airways
3         Kibana Airlines
4         Kibana Airlines
               ...
13054    Logstash Airways
13055    Logstash Airways
13056    Logstash Airways
13057            JetBeats
13058            JetBeats
Name: Carrier, Length: 13059, dtype: object
[35]:
pd_flights.get(['Carrier', 'Origin'])
[35]:
| Carrier | Origin | |
|---|---|---|
| 0 | Kibana Airlines | Frankfurt am Main Airport | 
| 1 | Logstash Airways | Cape Town International Airport | 
| 2 | Logstash Airways | Venice Marco Polo Airport | 
| 3 | Kibana Airlines | Naples International Airport | 
| 4 | Kibana Airlines | Licenciado Benito Juarez International Airport | 
| ... | ... | ... | 
| 13054 | Logstash Airways | Pisa International Airport | 
| 13055 | Logstash Airways | Winnipeg / James Armstrong Richardson Internat... | 
| 13056 | Logstash Airways | Licenciado Benito Juarez International Airport | 
| 13057 | JetBeats | Itami Airport | 
| 13058 | JetBeats | Adelaide International Airport | 
13059 rows × 2 columns
List input not currently supported by opensearch_py_ml.DataFrame.get
[36]:
try:
    oml_flights.get(['Carrier', 'Origin'])
except TypeError as e:
    print(e)
unhashable type: 'list'
DataFrame.query
[37]:
pd_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled == True')
[37]:
| AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
|---|---|---|---|---|---|
| 8 | 960.869736 | True | ... | 0 | 2018-01-01 12:09:35 | 
| 26 | 975.812632 | True | ... | 0 | 2018-01-01 15:38:32 | 
| 311 | 946.358410 | True | ... | 0 | 2018-01-01 11:51:12 | 
| 651 | 975.383864 | True | ... | 2 | 2018-01-03 21:13:17 | 
| 950 | 907.836523 | True | ... | 2 | 2018-01-03 05:14:51 | 
| ... | ... | ... | ... | ... | ... | 
| 12820 | 909.973606 | True | ... | 5 | 2018-02-10 05:11:35 | 
| 12906 | 983.429244 | True | ... | 6 | 2018-02-11 06:19:58 | 
| 12918 | 1136.678150 | True | ... | 6 | 2018-02-11 16:03:10 | 
| 12919 | 1105.211803 | True | ... | 6 | 2018-02-11 05:36:05 | 
| 13013 | 1055.350213 | True | ... | 6 | 2018-02-11 13:20:16 | 
68 rows × 27 columns
opensearch_py_ml.DataFrame.query requires qualifier on bool i.e.
oml_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled') fails
[38]:
oml_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled == True')
[38]:
| AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
|---|---|---|---|---|---|
| 8 | 960.869736 | True | ... | 0 | 2018-01-01 12:09:35 | 
| 26 | 975.812632 | True | ... | 0 | 2018-01-01 15:38:32 | 
| 311 | 946.358410 | True | ... | 0 | 2018-01-01 11:51:12 | 
| 651 | 975.383864 | True | ... | 2 | 2018-01-03 21:13:17 | 
| 950 | 907.836523 | True | ... | 2 | 2018-01-03 05:14:51 | 
| ... | ... | ... | ... | ... | ... | 
| 12820 | 909.973606 | True | ... | 5 | 2018-02-10 05:11:35 | 
| 12906 | 983.429244 | True | ... | 6 | 2018-02-11 06:19:58 | 
| 12918 | 1136.678150 | True | ... | 6 | 2018-02-11 16:03:10 | 
| 12919 | 1105.211803 | True | ... | 6 | 2018-02-11 05:36:05 | 
| 13013 | 1055.350213 | True | ... | 6 | 2018-02-11 13:20:16 | 
68 rows × 27 columns
Boolean indexing query
[39]:
pd_flights[(pd_flights.Carrier=="Kibana Airlines") &
           (pd_flights.AvgTicketPrice > 900.0) &
           (pd_flights.Cancelled == True)]
[39]:
| AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
|---|---|---|---|---|---|
| 8 | 960.869736 | True | ... | 0 | 2018-01-01 12:09:35 | 
| 26 | 975.812632 | True | ... | 0 | 2018-01-01 15:38:32 | 
| 311 | 946.358410 | True | ... | 0 | 2018-01-01 11:51:12 | 
| 651 | 975.383864 | True | ... | 2 | 2018-01-03 21:13:17 | 
| 950 | 907.836523 | True | ... | 2 | 2018-01-03 05:14:51 | 
| ... | ... | ... | ... | ... | ... | 
| 12820 | 909.973606 | True | ... | 5 | 2018-02-10 05:11:35 | 
| 12906 | 983.429244 | True | ... | 6 | 2018-02-11 06:19:58 | 
| 12918 | 1136.678150 | True | ... | 6 | 2018-02-11 16:03:10 | 
| 12919 | 1105.211803 | True | ... | 6 | 2018-02-11 05:36:05 | 
| 13013 | 1055.350213 | True | ... | 6 | 2018-02-11 13:20:16 | 
68 rows × 27 columns
[41]:
oml_flights[(oml_flights.Carrier=="Kibana Airlines") &
           (oml_flights.AvgTicketPrice > 900.0) &
           (oml_flights.Cancelled == True)]
[41]:
| AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
|---|---|---|---|---|---|
| 8 | 960.869736 | True | ... | 0 | 2018-01-01 12:09:35 | 
| 26 | 975.812632 | True | ... | 0 | 2018-01-01 15:38:32 | 
| 311 | 946.358410 | True | ... | 0 | 2018-01-01 11:51:12 | 
| 651 | 975.383864 | True | ... | 2 | 2018-01-03 21:13:17 | 
| 950 | 907.836523 | True | ... | 2 | 2018-01-03 05:14:51 | 
| ... | ... | ... | ... | ... | ... | 
| 12820 | 909.973606 | True | ... | 5 | 2018-02-10 05:11:35 | 
| 12906 | 983.429244 | True | ... | 6 | 2018-02-11 06:19:58 | 
| 12918 | 1136.678150 | True | ... | 6 | 2018-02-11 16:03:10 | 
| 12919 | 1105.211803 | True | ... | 6 | 2018-02-11 05:36:05 | 
| 13013 | 1055.350213 | True | ... | 6 | 2018-02-11 13:20:16 | 
68 rows × 27 columns
Function application, GroupBy & window
DataFrame.aggs
[42]:
pd_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
[42]:
| DistanceKilometers | AvgTicketPrice | |
|---|---|---|
| sum | 9.261629e+07 | 8.204365e+06 | 
| min | 0.000000e+00 | 1.000205e+02 | 
| std | 4.578438e+03 | 2.663969e+02 | 
opensearch_py_ml.DataFrame.aggregate currently only supported numeric columns
[43]:
oml_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
[43]:
| DistanceKilometers | AvgTicketPrice | |
|---|---|---|
| sum | 9.261629e+07 | 8.204365e+06 | 
| min | 0.000000e+00 | 1.000205e+02 | 
| std | 4.578614e+03 | 2.664071e+02 | 
Computations / descriptive stats
DataFrame.count
[44]:
pd_flights.count()
[44]:
AvgTicketPrice    13059
Cancelled         13059
Carrier           13059
Dest              13059
DestAirportID     13059
                  ...
OriginLocation    13059
OriginRegion      13059
OriginWeather     13059
dayOfWeek         13059
timestamp         13059
Length: 27, dtype: int64
[45]:
oml_flights.count()
[45]:
AvgTicketPrice    13059
Cancelled         13059
Carrier           13059
Dest              13059
DestAirportID     13059
                  ...
OriginLocation    13059
OriginRegion      13059
OriginWeather     13059
dayOfWeek         13059
timestamp         13059
Length: 27, dtype: int64
DataFrame.describe
[46]:
pd_flights.describe()
[46]:
| AvgTicketPrice | DistanceKilometers | ... | FlightTimeMin | dayOfWeek | |
|---|---|---|---|---|---|
| count | 13059.000000 | 13059.000000 | ... | 13059.000000 | 13059.000000 | 
| mean | 628.253689 | 7092.142455 | ... | 511.127842 | 2.835975 | 
| std | 266.396861 | 4578.438497 | ... | 334.753952 | 1.939439 | 
| min | 100.020528 | 0.000000 | ... | 0.000000 | 0.000000 | 
| 25% | 409.893816 | 2459.705673 | ... | 252.333192 | 1.000000 | 
| 50% | 640.556668 | 7610.330866 | ... | 503.045170 | 3.000000 | 
| 75% | 842.185470 | 9736.637600 | ... | 720.416036 | 4.000000 | 
| max | 1199.729053 | 19881.482315 | ... | 1902.902032 | 6.000000 | 
8 rows × 7 columns
Values returned from opensearch_py_ml.DataFrame.describe may vary due to results of Opensearch aggregations.
[47]:
# NBVAL_IGNORE_OUTPUT
oml_flights.describe()
[47]:
| AvgTicketPrice | Cancelled | ... | FlightTimeMin | dayOfWeek | |
|---|---|---|---|---|---|
| count | 13059.000000 | 13059.000000 | ... | 13059.000000 | 13059.000000 | 
| mean | 628.253689 | 0.128494 | ... | 511.127842 | 2.835975 | 
| std | 266.407061 | 0.334664 | ... | 334.766770 | 1.939513 | 
| min | 100.020531 | 0.000000 | ... | 0.000000 | 0.000000 | 
| 25% | 410.008918 | 0.000000 | ... | 251.938710 | 1.000000 | 
| 50% | 640.387285 | 0.000000 | ... | 503.148975 | 3.000000 | 
| 75% | 842.213490 | 0.000000 | ... | 720.505705 | 4.000000 | 
| max | 1199.729004 | 1.000000 | ... | 1902.901978 | 6.000000 | 
8 rows × 9 columns
DataFrame.info
[48]:
pd_flights.info()
<class 'pandas.core.frame.DataFrame'>
Index: 13059 entries, 0 to 13058
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   AvgTicketPrice      13059 non-null  float64
 1   Cancelled           13059 non-null  bool
 2   Carrier             13059 non-null  object
 3   Dest                13059 non-null  object
 4   DestAirportID       13059 non-null  object
 5   DestCityName        13059 non-null  object
 6   DestCountry         13059 non-null  object
 7   DestLocation        13059 non-null  object
 8   DestRegion          13059 non-null  object
 9   DestWeather         13059 non-null  object
 10  DistanceKilometers  13059 non-null  float64
 11  DistanceMiles       13059 non-null  float64
 12  FlightDelay         13059 non-null  bool
 13  FlightDelayMin      13059 non-null  int64
 14  FlightDelayType     13059 non-null  object
 15  FlightNum           13059 non-null  object
 16  FlightTimeHour      13059 non-null  float64
 17  FlightTimeMin       13059 non-null  float64
 18  Origin              13059 non-null  object
 19  OriginAirportID     13059 non-null  object
 20  OriginCityName      13059 non-null  object
 21  OriginCountry       13059 non-null  object
 22  OriginLocation      13059 non-null  object
 23  OriginRegion        13059 non-null  object
 24  OriginWeather       13059 non-null  object
 25  dayOfWeek           13059 non-null  int64
 26  timestamp           13059 non-null  datetime64[ns]
dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)
memory usage: 3.1+ MB
[49]:
oml_flights.info()
<class 'opensearch_py_ml.dataframe.DataFrame'>
Index: 13059 entries, 0 to 13058
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   AvgTicketPrice      13059 non-null  float64
 1   Cancelled           13059 non-null  bool
 2   Carrier             13059 non-null  object
 3   Dest                13059 non-null  object
 4   DestAirportID       13059 non-null  object
 5   DestCityName        13059 non-null  object
 6   DestCountry         13059 non-null  object
 7   DestLocation        13059 non-null  object
 8   DestRegion          13059 non-null  object
 9   DestWeather         13059 non-null  object
 10  DistanceKilometers  13059 non-null  float64
 11  DistanceMiles       13059 non-null  float64
 12  FlightDelay         13059 non-null  bool
 13  FlightDelayMin      13059 non-null  int64
 14  FlightDelayType     13059 non-null  object
 15  FlightNum           13059 non-null  object
 16  FlightTimeHour      13059 non-null  float64
 17  FlightTimeMin       13059 non-null  float64
 18  Origin              13059 non-null  object
 19  OriginAirportID     13059 non-null  object
 20  OriginCityName      13059 non-null  object
 21  OriginCountry       13059 non-null  object
 22  OriginLocation      13059 non-null  object
 23  OriginRegion        13059 non-null  object
 24  OriginWeather       13059 non-null  object
 25  dayOfWeek           13059 non-null  int64
 26  timestamp           13059 non-null  datetime64[ns]
dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)
memory usage: 64.000 bytes
OpenSearch storage usage: 10.714 MB
DataFrame.max, DataFrame.min, DataFrame.mean, DataFrame.sum
max
[50]:
pd_flights.max(numeric_only=True)
[50]:
AvgTicketPrice         1199.729053
Cancelled                     True
DistanceKilometers    19881.482315
DistanceMiles         12353.780369
FlightDelay                   True
FlightDelayMin                 360
FlightTimeHour           31.715034
FlightTimeMin          1902.902032
dayOfWeek                        6
dtype: object
opensearch_py_ml.DataFrame.max,min,mean,sum only aggregate numeric columns
[51]:
oml_flights.max(numeric_only=True)
[51]:
AvgTicketPrice         1199.729004
Cancelled                 1.000000
DistanceKilometers    19881.482422
DistanceMiles         12353.780273
FlightDelay               1.000000
FlightDelayMin          360.000000
FlightTimeHour           31.715034
FlightTimeMin          1902.901978
dayOfWeek                 6.000000
dtype: float64
min
[52]:
pd_flights.min(numeric_only=True)
[52]:
AvgTicketPrice        100.020528
Cancelled                  False
DistanceKilometers           0.0
DistanceMiles                0.0
FlightDelay                False
FlightDelayMin                 0
FlightTimeHour               0.0
FlightTimeMin                0.0
dayOfWeek                      0
dtype: object
[53]:
oml_flights.min(numeric_only=True)
[53]:
AvgTicketPrice        100.020531
Cancelled               0.000000
DistanceKilometers      0.000000
DistanceMiles           0.000000
FlightDelay             0.000000
FlightDelayMin          0.000000
FlightTimeHour          0.000000
FlightTimeMin           0.000000
dayOfWeek               0.000000
dtype: float64
mean
[54]:
pd_flights.mean(numeric_only=True)
[54]:
AvgTicketPrice         628.253689
Cancelled                0.128494
DistanceKilometers    7092.142455
DistanceMiles         4406.853013
FlightDelay              0.251168
FlightDelayMin          47.335171
FlightTimeHour           8.518797
FlightTimeMin          511.127842
dayOfWeek                2.835975
dtype: float64
[55]:
oml_flights.mean(numeric_only=True)
[55]:
AvgTicketPrice         628.253689
Cancelled                0.128494
DistanceKilometers    7092.142457
DistanceMiles         4406.853010
FlightDelay              0.251168
FlightDelayMin          47.335171
FlightTimeHour           8.518797
FlightTimeMin          511.127842
dayOfWeek                2.835975
dtype: float64
sum
[56]:
pd_flights.sum(numeric_only=True)
[56]:
AvgTicketPrice        8.204365e+06
Cancelled             1.678000e+03
DistanceKilometers    9.261629e+07
DistanceMiles         5.754909e+07
FlightDelay           3.280000e+03
FlightDelayMin        6.181500e+05
FlightTimeHour        1.112470e+05
FlightTimeMin         6.674818e+06
dayOfWeek             3.703500e+04
dtype: float64
[57]:
oml_flights.sum(numeric_only=True)
[57]:
AvgTicketPrice        8.204365e+06
Cancelled             1.678000e+03
DistanceKilometers    9.261629e+07
DistanceMiles         5.754909e+07
FlightDelay           3.280000e+03
FlightDelayMin        6.181500e+05
FlightTimeHour        1.112470e+05
FlightTimeMin         6.674818e+06
dayOfWeek             3.703500e+04
dtype: float64
[58]:
pd_flights[['Carrier', 'Origin', 'Dest']].nunique()
[58]:
Carrier      4
Origin     156
Dest       156
dtype: int64
[59]:
oml_flights[['Carrier', 'Origin', 'Dest']].nunique()
[59]:
Carrier      4
Origin     156
Dest       156
dtype: int64
[60]:
pd_flights.drop(columns=['AvgTicketPrice',
                         'Cancelled',
                         'DestLocation',
                         'Dest',
                         'DestAirportID',
                         'DestCityName',
                         'DestCountry'])
[60]:
| Carrier | DestRegion | ... | dayOfWeek | timestamp | |
|---|---|---|---|---|---|
| 0 | Kibana Airlines | SE-BD | ... | 0 | 2018-01-01 00:00:00 | 
| 1 | Logstash Airways | IT-34 | ... | 0 | 2018-01-01 18:27:00 | 
| 2 | Logstash Airways | IT-34 | ... | 0 | 2018-01-01 17:11:14 | 
| 3 | Kibana Airlines | IT-34 | ... | 0 | 2018-01-01 10:33:28 | 
| 4 | Kibana Airlines | SE-BD | ... | 0 | 2018-01-01 05:13:00 | 
| ... | ... | ... | ... | ... | ... | 
| 13054 | Logstash Airways | SE-BD | ... | 6 | 2018-02-11 20:42:25 | 
| 13055 | Logstash Airways | CH-ZH | ... | 6 | 2018-02-11 01:41:57 | 
| 13056 | Logstash Airways | RU-AMU | ... | 6 | 2018-02-11 04:09:27 | 
| 13057 | JetBeats | SE-BD | ... | 6 | 2018-02-11 08:28:21 | 
| 13058 | JetBeats | US-DC | ... | 6 | 2018-02-11 14:54:34 | 
13059 rows × 20 columns
[61]:
oml_flights.drop(columns=['AvgTicketPrice',
                         'Cancelled',
                         'DestLocation',
                         'Dest',
                         'DestAirportID',
                         'DestCityName',
                         'DestCountry'])
[61]:
| Carrier | DestRegion | ... | dayOfWeek | timestamp | |
|---|---|---|---|---|---|
| 0 | Kibana Airlines | SE-BD | ... | 0 | 2018-01-01 00:00:00 | 
| 1 | Logstash Airways | IT-34 | ... | 0 | 2018-01-01 18:27:00 | 
| 2 | Logstash Airways | IT-34 | ... | 0 | 2018-01-01 17:11:14 | 
| 3 | Kibana Airlines | IT-34 | ... | 0 | 2018-01-01 10:33:28 | 
| 4 | Kibana Airlines | SE-BD | ... | 0 | 2018-01-01 05:13:00 | 
| ... | ... | ... | ... | ... | ... | 
| 13054 | Logstash Airways | SE-BD | ... | 6 | 2018-02-11 20:42:25 | 
| 13055 | Logstash Airways | CH-ZH | ... | 6 | 2018-02-11 01:41:57 | 
| 13056 | Logstash Airways | RU-AMU | ... | 6 | 2018-02-11 04:09:27 | 
| 13057 | JetBeats | SE-BD | ... | 6 | 2018-02-11 08:28:21 | 
| 13058 | JetBeats | US-DC | ... | 6 | 2018-02-11 14:54:34 | 
13059 rows × 20 columns
[62]:
pd_flights.select_dtypes(include=np.number).hist(figsize=[10,10])
plt.show()
[63]:
oml_flights.select_dtypes(include=np.number).hist(figsize=[10,10])
plt.show()
[64]:
oml_flights2 = oml_flights[(oml_flights.OriginAirportID == 'AMS') & (oml_flights.FlightDelayMin > 60)]
oml_flights2 = oml_flights2[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]
oml_flights2 = oml_flights2.tail()
[65]:
print(oml_flights2.os_info())
os_index_pattern: flights
Index:
 os_index_field: _id
 is_source_field: False
Mappings:
 capabilities:
                   os_field_name  is_source os_dtype                  os_date_format        pd_dtype  is_searchable  is_aggregatable  is_scripted aggregatable_os_field_name
timestamp              timestamp       True     date  strict_date_hour_minute_second  datetime64[ns]           True             True        False                  timestamp
OriginAirportID  OriginAirportID       True  keyword                            None          object           True             True        False            OriginAirportID
DestAirportID      DestAirportID       True  keyword                            None          object           True             True        False              DestAirportID
FlightDelayMin    FlightDelayMin       True  integer                            None           int64           True             True        False             FlightDelayMin
Operations:
 tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]
 size: 5
 sort_params: {'_doc': 'desc'}
 _source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
 body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}}
 post_processing: [('sort_index')]
[ ]: