Demo Notebook for Dataframe
Download notebook
Step 0: Imports
[2]:
%load_ext autoreload
%autoreload 2
[3]:
# import this to stop opensearch-py-ml from yelling every time a DataFrame connection made
import warnings
warnings.filterwarnings('ignore')
[4]:
# imports to demonstrate DataFrame support
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import opensearch_py_ml as oml
from opensearchpy import OpenSearch
# Import standard test settings for consistent results
from opensearch_py_ml.conftest import *
Step 1: Setup clients
[5]:
CLUSTER_URL = 'https://localhost:9200'
def get_os_client(cluster_url = CLUSTER_URL,
username='admin',
password='< admin password >'):
'''
Get OpenSearch client
:param cluster_url: cluster URL like https://ml-te-netwo-1s12ba42br23v-ff1736fa7db98ff2.elb.us-west-2.amazonaws.com:443
:return: OpenSearch client
'''
client = OpenSearch(
hosts=[cluster_url],
http_auth=(username, password),
verify_certs=False
)
return client
[6]:
client = get_os_client()
Comparing similarities and differences between panda dataframe vs opensearch_py_ml dataframe
Create an opensearch_py_ml.DataFrame from a flights
index
[7]:
oml_flights = oml.DataFrame(client, 'flights')
[8]:
type(oml_flights)
[8]:
opensearch_py_ml.dataframe.DataFrame
Compare to pandas DataFrame (created from the same data)
[9]:
pd_flights = oml.opensearch_to_pandas(oml_flights)
[10]:
type(pd_flights)
[10]:
pandas.core.frame.DataFrame
Attributes and underlying data
DataFrame.columns
[11]:
pd_flights.columns
[11]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
'timestamp'],
dtype='object')
[12]:
oml_flights.columns
[12]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
'timestamp'],
dtype='object')
DataFrame.dtypes
[13]:
pd_flights.dtypes
[13]:
AvgTicketPrice float64
Cancelled bool
Carrier object
Dest object
DestAirportID object
...
OriginLocation object
OriginRegion object
OriginWeather object
dayOfWeek int64
timestamp datetime64[ns]
Length: 27, dtype: object
[14]:
oml_flights.dtypes
[14]:
AvgTicketPrice float64
Cancelled bool
Carrier object
Dest object
DestAirportID object
...
OriginLocation object
OriginRegion object
OriginWeather object
dayOfWeek int64
timestamp datetime64[ns]
Length: 27, dtype: object
DataFrame.select_dtypes
[15]:
pd_flights.select_dtypes(include=np.number)
[15]:
AvgTicketPrice | DistanceKilometers | ... | FlightTimeMin | dayOfWeek | |
---|---|---|---|---|---|
0 | 841.265642 | 16492.326654 | ... | 1030.770416 | 0 |
1 | 882.982662 | 8823.400140 | ... | 464.389481 | 0 |
2 | 190.636904 | 0.000000 | ... | 0.000000 | 0 |
3 | 181.694216 | 555.737767 | ... | 222.749059 | 0 |
4 | 730.041778 | 13358.244200 | ... | 785.779071 | 0 |
... | ... | ... | ... | ... | ... |
13054 | 1080.446279 | 8058.581753 | ... | 402.929088 | 6 |
13055 | 646.612941 | 7088.598322 | ... | 644.418029 | 6 |
13056 | 997.751876 | 10920.652972 | ... | 937.540811 | 6 |
13057 | 1102.814465 | 18748.859647 | ... | 1697.404971 | 6 |
13058 | 858.144337 | 16809.141923 | ... | 1610.761827 | 6 |
13059 rows × 7 columns
[16]:
oml_flights.select_dtypes(include=np.number)
[16]:
AvgTicketPrice | DistanceKilometers | ... | FlightTimeMin | dayOfWeek | |
---|---|---|---|---|---|
0 | 841.265642 | 16492.326654 | ... | 1030.770416 | 0 |
1 | 882.982662 | 8823.400140 | ... | 464.389481 | 0 |
2 | 190.636904 | 0.000000 | ... | 0.000000 | 0 |
3 | 181.694216 | 555.737767 | ... | 222.749059 | 0 |
4 | 730.041778 | 13358.244200 | ... | 785.779071 | 0 |
... | ... | ... | ... | ... | ... |
13054 | 1080.446279 | 8058.581753 | ... | 402.929088 | 6 |
13055 | 646.612941 | 7088.598322 | ... | 644.418029 | 6 |
13056 | 997.751876 | 10920.652972 | ... | 937.540811 | 6 |
13057 | 1102.814465 | 18748.859647 | ... | 1697.404971 | 6 |
13058 | 858.144337 | 16809.141923 | ... | 1610.761827 | 6 |
13059 rows × 7 columns
DataFrame.empty
[17]:
pd_flights.empty
[17]:
False
[18]:
oml_flights.empty
[18]:
False
DataFrame.shape
[19]:
pd_flights.shape
[19]:
(13059, 27)
[20]:
oml_flights.shape
[20]:
(13059, 27)
DataFrame.index
Note, opensearch_py_ml.DataFrame.index
does not mirror pandas.DataFrame.index
.
[21]:
pd_flights.index
[21]:
Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
...
'13049', '13050', '13051', '13052', '13053', '13054', '13055', '13056', '13057', '13058'],
dtype='object', length=13059)
[22]:
# NBVAL_IGNORE_OUTPUT
oml_flights.index
[22]:
<opensearch_py_ml.index.Index at 0x1131744c0>
[24]:
oml_flights.index.os_index_field
[24]:
'_id'
DataFrame.values
Note, opensearch_py_ml.DataFrame.values
is not supported.
[25]:
pd_flights.values
[25]:
array([[841.2656419677076, False, 'Kibana Airlines', ..., 'Sunny', 0,
Timestamp('2018-01-01 00:00:00')],
[882.9826615595518, False, 'Logstash Airways', ..., 'Clear', 0,
Timestamp('2018-01-01 18:27:00')],
[190.6369038508356, False, 'Logstash Airways', ..., 'Rain', 0,
Timestamp('2018-01-01 17:11:14')],
...,
[997.7518761454494, False, 'Logstash Airways', ..., 'Sunny', 6,
Timestamp('2018-02-11 04:09:27')],
[1102.8144645388556, False, 'JetBeats', ..., 'Hail', 6,
Timestamp('2018-02-11 08:28:21')],
[858.1443369038839, False, 'JetBeats', ..., 'Rain', 6,
Timestamp('2018-02-11 14:54:34')]], dtype=object)
[26]:
try:
oml_flights.values
except AttributeError as e:
print(e)
This method would scan/scroll the entire OpenSearch index(s) into memory. If this is explicitly required, and there is sufficient memory, call `ed.opensearch_to_pandas(ed_df).values`
Indexing, iteration
DataFrame.head
[27]:
pd_flights.head()
[27]:
AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
---|---|---|---|---|---|
0 | 841.265642 | False | ... | 0 | 2018-01-01 00:00:00 |
1 | 882.982662 | False | ... | 0 | 2018-01-01 18:27:00 |
2 | 190.636904 | False | ... | 0 | 2018-01-01 17:11:14 |
3 | 181.694216 | True | ... | 0 | 2018-01-01 10:33:28 |
4 | 730.041778 | False | ... | 0 | 2018-01-01 05:13:00 |
5 rows × 27 columns
[28]:
oml_flights.head()
[28]:
AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
---|---|---|---|---|---|
0 | 841.265642 | False | ... | 0 | 2018-01-01 00:00:00 |
1 | 882.982662 | False | ... | 0 | 2018-01-01 18:27:00 |
2 | 190.636904 | False | ... | 0 | 2018-01-01 17:11:14 |
3 | 181.694216 | True | ... | 0 | 2018-01-01 10:33:28 |
4 | 730.041778 | False | ... | 0 | 2018-01-01 05:13:00 |
5 rows × 27 columns
DataFrame.tail
[29]:
pd_flights.tail()
[29]:
AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
---|---|---|---|---|---|
13054 | 1080.446279 | False | ... | 6 | 2018-02-11 20:42:25 |
13055 | 646.612941 | False | ... | 6 | 2018-02-11 01:41:57 |
13056 | 997.751876 | False | ... | 6 | 2018-02-11 04:09:27 |
13057 | 1102.814465 | False | ... | 6 | 2018-02-11 08:28:21 |
13058 | 858.144337 | False | ... | 6 | 2018-02-11 14:54:34 |
5 rows × 27 columns
[30]:
oml_flights.tail()
[30]:
AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
---|---|---|---|---|---|
13054 | 1080.446279 | False | ... | 6 | 2018-02-11 20:42:25 |
13055 | 646.612941 | False | ... | 6 | 2018-02-11 01:41:57 |
13056 | 997.751876 | False | ... | 6 | 2018-02-11 04:09:27 |
13057 | 1102.814465 | False | ... | 6 | 2018-02-11 08:28:21 |
13058 | 858.144337 | False | ... | 6 | 2018-02-11 14:54:34 |
5 rows × 27 columns
DataFrame.keys
[31]:
pd_flights.keys()
[31]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
'timestamp'],
dtype='object')
[32]:
oml_flights.keys()
[32]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
'timestamp'],
dtype='object')
DataFrame.get
[33]:
pd_flights.get('Carrier')
[33]:
0 Kibana Airlines
1 Logstash Airways
2 Logstash Airways
3 Kibana Airlines
4 Kibana Airlines
...
13054 Logstash Airways
13055 Logstash Airways
13056 Logstash Airways
13057 JetBeats
13058 JetBeats
Name: Carrier, Length: 13059, dtype: object
[34]:
oml_flights.get('Carrier')
[34]:
0 Kibana Airlines
1 Logstash Airways
2 Logstash Airways
3 Kibana Airlines
4 Kibana Airlines
...
13054 Logstash Airways
13055 Logstash Airways
13056 Logstash Airways
13057 JetBeats
13058 JetBeats
Name: Carrier, Length: 13059, dtype: object
[35]:
pd_flights.get(['Carrier', 'Origin'])
[35]:
Carrier | Origin | |
---|---|---|
0 | Kibana Airlines | Frankfurt am Main Airport |
1 | Logstash Airways | Cape Town International Airport |
2 | Logstash Airways | Venice Marco Polo Airport |
3 | Kibana Airlines | Naples International Airport |
4 | Kibana Airlines | Licenciado Benito Juarez International Airport |
... | ... | ... |
13054 | Logstash Airways | Pisa International Airport |
13055 | Logstash Airways | Winnipeg / James Armstrong Richardson Internat... |
13056 | Logstash Airways | Licenciado Benito Juarez International Airport |
13057 | JetBeats | Itami Airport |
13058 | JetBeats | Adelaide International Airport |
13059 rows × 2 columns
List input not currently supported by opensearch_py_ml.DataFrame.get
[36]:
try:
oml_flights.get(['Carrier', 'Origin'])
except TypeError as e:
print(e)
unhashable type: 'list'
DataFrame.query
[37]:
pd_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled == True')
[37]:
AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
---|---|---|---|---|---|
8 | 960.869736 | True | ... | 0 | 2018-01-01 12:09:35 |
26 | 975.812632 | True | ... | 0 | 2018-01-01 15:38:32 |
311 | 946.358410 | True | ... | 0 | 2018-01-01 11:51:12 |
651 | 975.383864 | True | ... | 2 | 2018-01-03 21:13:17 |
950 | 907.836523 | True | ... | 2 | 2018-01-03 05:14:51 |
... | ... | ... | ... | ... | ... |
12820 | 909.973606 | True | ... | 5 | 2018-02-10 05:11:35 |
12906 | 983.429244 | True | ... | 6 | 2018-02-11 06:19:58 |
12918 | 1136.678150 | True | ... | 6 | 2018-02-11 16:03:10 |
12919 | 1105.211803 | True | ... | 6 | 2018-02-11 05:36:05 |
13013 | 1055.350213 | True | ... | 6 | 2018-02-11 13:20:16 |
68 rows × 27 columns
opensearch_py_ml.DataFrame.query
requires qualifier on bool i.e.
oml_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled')
fails
[38]:
oml_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled == True')
[38]:
AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
---|---|---|---|---|---|
8 | 960.869736 | True | ... | 0 | 2018-01-01 12:09:35 |
26 | 975.812632 | True | ... | 0 | 2018-01-01 15:38:32 |
311 | 946.358410 | True | ... | 0 | 2018-01-01 11:51:12 |
651 | 975.383864 | True | ... | 2 | 2018-01-03 21:13:17 |
950 | 907.836523 | True | ... | 2 | 2018-01-03 05:14:51 |
... | ... | ... | ... | ... | ... |
12820 | 909.973606 | True | ... | 5 | 2018-02-10 05:11:35 |
12906 | 983.429244 | True | ... | 6 | 2018-02-11 06:19:58 |
12918 | 1136.678150 | True | ... | 6 | 2018-02-11 16:03:10 |
12919 | 1105.211803 | True | ... | 6 | 2018-02-11 05:36:05 |
13013 | 1055.350213 | True | ... | 6 | 2018-02-11 13:20:16 |
68 rows × 27 columns
Boolean indexing query
[39]:
pd_flights[(pd_flights.Carrier=="Kibana Airlines") &
(pd_flights.AvgTicketPrice > 900.0) &
(pd_flights.Cancelled == True)]
[39]:
AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
---|---|---|---|---|---|
8 | 960.869736 | True | ... | 0 | 2018-01-01 12:09:35 |
26 | 975.812632 | True | ... | 0 | 2018-01-01 15:38:32 |
311 | 946.358410 | True | ... | 0 | 2018-01-01 11:51:12 |
651 | 975.383864 | True | ... | 2 | 2018-01-03 21:13:17 |
950 | 907.836523 | True | ... | 2 | 2018-01-03 05:14:51 |
... | ... | ... | ... | ... | ... |
12820 | 909.973606 | True | ... | 5 | 2018-02-10 05:11:35 |
12906 | 983.429244 | True | ... | 6 | 2018-02-11 06:19:58 |
12918 | 1136.678150 | True | ... | 6 | 2018-02-11 16:03:10 |
12919 | 1105.211803 | True | ... | 6 | 2018-02-11 05:36:05 |
13013 | 1055.350213 | True | ... | 6 | 2018-02-11 13:20:16 |
68 rows × 27 columns
[41]:
oml_flights[(oml_flights.Carrier=="Kibana Airlines") &
(oml_flights.AvgTicketPrice > 900.0) &
(oml_flights.Cancelled == True)]
[41]:
AvgTicketPrice | Cancelled | ... | dayOfWeek | timestamp | |
---|---|---|---|---|---|
8 | 960.869736 | True | ... | 0 | 2018-01-01 12:09:35 |
26 | 975.812632 | True | ... | 0 | 2018-01-01 15:38:32 |
311 | 946.358410 | True | ... | 0 | 2018-01-01 11:51:12 |
651 | 975.383864 | True | ... | 2 | 2018-01-03 21:13:17 |
950 | 907.836523 | True | ... | 2 | 2018-01-03 05:14:51 |
... | ... | ... | ... | ... | ... |
12820 | 909.973606 | True | ... | 5 | 2018-02-10 05:11:35 |
12906 | 983.429244 | True | ... | 6 | 2018-02-11 06:19:58 |
12918 | 1136.678150 | True | ... | 6 | 2018-02-11 16:03:10 |
12919 | 1105.211803 | True | ... | 6 | 2018-02-11 05:36:05 |
13013 | 1055.350213 | True | ... | 6 | 2018-02-11 13:20:16 |
68 rows × 27 columns
Function application, GroupBy & window
DataFrame.aggs
[42]:
pd_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
[42]:
DistanceKilometers | AvgTicketPrice | |
---|---|---|
sum | 9.261629e+07 | 8.204365e+06 |
min | 0.000000e+00 | 1.000205e+02 |
std | 4.578438e+03 | 2.663969e+02 |
opensearch_py_ml.DataFrame.aggregate
currently only supported numeric columns
[43]:
oml_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
[43]:
DistanceKilometers | AvgTicketPrice | |
---|---|---|
sum | 9.261629e+07 | 8.204365e+06 |
min | 0.000000e+00 | 1.000205e+02 |
std | 4.578614e+03 | 2.664071e+02 |
Computations / descriptive stats
DataFrame.count
[44]:
pd_flights.count()
[44]:
AvgTicketPrice 13059
Cancelled 13059
Carrier 13059
Dest 13059
DestAirportID 13059
...
OriginLocation 13059
OriginRegion 13059
OriginWeather 13059
dayOfWeek 13059
timestamp 13059
Length: 27, dtype: int64
[45]:
oml_flights.count()
[45]:
AvgTicketPrice 13059
Cancelled 13059
Carrier 13059
Dest 13059
DestAirportID 13059
...
OriginLocation 13059
OriginRegion 13059
OriginWeather 13059
dayOfWeek 13059
timestamp 13059
Length: 27, dtype: int64
DataFrame.describe
[46]:
pd_flights.describe()
[46]:
AvgTicketPrice | DistanceKilometers | ... | FlightTimeMin | dayOfWeek | |
---|---|---|---|---|---|
count | 13059.000000 | 13059.000000 | ... | 13059.000000 | 13059.000000 |
mean | 628.253689 | 7092.142455 | ... | 511.127842 | 2.835975 |
std | 266.396861 | 4578.438497 | ... | 334.753952 | 1.939439 |
min | 100.020528 | 0.000000 | ... | 0.000000 | 0.000000 |
25% | 409.893816 | 2459.705673 | ... | 252.333192 | 1.000000 |
50% | 640.556668 | 7610.330866 | ... | 503.045170 | 3.000000 |
75% | 842.185470 | 9736.637600 | ... | 720.416036 | 4.000000 |
max | 1199.729053 | 19881.482315 | ... | 1902.902032 | 6.000000 |
8 rows × 7 columns
Values returned from opensearch_py_ml.DataFrame.describe
may vary due to results of Opensearch aggregations.
[47]:
# NBVAL_IGNORE_OUTPUT
oml_flights.describe()
[47]:
AvgTicketPrice | Cancelled | ... | FlightTimeMin | dayOfWeek | |
---|---|---|---|---|---|
count | 13059.000000 | 13059.000000 | ... | 13059.000000 | 13059.000000 |
mean | 628.253689 | 0.128494 | ... | 511.127842 | 2.835975 |
std | 266.407061 | 0.334664 | ... | 334.766770 | 1.939513 |
min | 100.020531 | 0.000000 | ... | 0.000000 | 0.000000 |
25% | 410.008918 | 0.000000 | ... | 251.938710 | 1.000000 |
50% | 640.387285 | 0.000000 | ... | 503.148975 | 3.000000 |
75% | 842.213490 | 0.000000 | ... | 720.505705 | 4.000000 |
max | 1199.729004 | 1.000000 | ... | 1902.901978 | 6.000000 |
8 rows × 9 columns
DataFrame.info
[48]:
pd_flights.info()
<class 'pandas.core.frame.DataFrame'>
Index: 13059 entries, 0 to 13058
Data columns (total 27 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 AvgTicketPrice 13059 non-null float64
1 Cancelled 13059 non-null bool
2 Carrier 13059 non-null object
3 Dest 13059 non-null object
4 DestAirportID 13059 non-null object
5 DestCityName 13059 non-null object
6 DestCountry 13059 non-null object
7 DestLocation 13059 non-null object
8 DestRegion 13059 non-null object
9 DestWeather 13059 non-null object
10 DistanceKilometers 13059 non-null float64
11 DistanceMiles 13059 non-null float64
12 FlightDelay 13059 non-null bool
13 FlightDelayMin 13059 non-null int64
14 FlightDelayType 13059 non-null object
15 FlightNum 13059 non-null object
16 FlightTimeHour 13059 non-null float64
17 FlightTimeMin 13059 non-null float64
18 Origin 13059 non-null object
19 OriginAirportID 13059 non-null object
20 OriginCityName 13059 non-null object
21 OriginCountry 13059 non-null object
22 OriginLocation 13059 non-null object
23 OriginRegion 13059 non-null object
24 OriginWeather 13059 non-null object
25 dayOfWeek 13059 non-null int64
26 timestamp 13059 non-null datetime64[ns]
dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)
memory usage: 3.1+ MB
[49]:
oml_flights.info()
<class 'opensearch_py_ml.dataframe.DataFrame'>
Index: 13059 entries, 0 to 13058
Data columns (total 27 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 AvgTicketPrice 13059 non-null float64
1 Cancelled 13059 non-null bool
2 Carrier 13059 non-null object
3 Dest 13059 non-null object
4 DestAirportID 13059 non-null object
5 DestCityName 13059 non-null object
6 DestCountry 13059 non-null object
7 DestLocation 13059 non-null object
8 DestRegion 13059 non-null object
9 DestWeather 13059 non-null object
10 DistanceKilometers 13059 non-null float64
11 DistanceMiles 13059 non-null float64
12 FlightDelay 13059 non-null bool
13 FlightDelayMin 13059 non-null int64
14 FlightDelayType 13059 non-null object
15 FlightNum 13059 non-null object
16 FlightTimeHour 13059 non-null float64
17 FlightTimeMin 13059 non-null float64
18 Origin 13059 non-null object
19 OriginAirportID 13059 non-null object
20 OriginCityName 13059 non-null object
21 OriginCountry 13059 non-null object
22 OriginLocation 13059 non-null object
23 OriginRegion 13059 non-null object
24 OriginWeather 13059 non-null object
25 dayOfWeek 13059 non-null int64
26 timestamp 13059 non-null datetime64[ns]
dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)
memory usage: 64.000 bytes
OpenSearch storage usage: 10.714 MB
DataFrame.max, DataFrame.min, DataFrame.mean, DataFrame.sum
max
[50]:
pd_flights.max(numeric_only=True)
[50]:
AvgTicketPrice 1199.729053
Cancelled True
DistanceKilometers 19881.482315
DistanceMiles 12353.780369
FlightDelay True
FlightDelayMin 360
FlightTimeHour 31.715034
FlightTimeMin 1902.902032
dayOfWeek 6
dtype: object
opensearch_py_ml.DataFrame.max,min,mean,sum
only aggregate numeric columns
[51]:
oml_flights.max(numeric_only=True)
[51]:
AvgTicketPrice 1199.729004
Cancelled 1.000000
DistanceKilometers 19881.482422
DistanceMiles 12353.780273
FlightDelay 1.000000
FlightDelayMin 360.000000
FlightTimeHour 31.715034
FlightTimeMin 1902.901978
dayOfWeek 6.000000
dtype: float64
min
[52]:
pd_flights.min(numeric_only=True)
[52]:
AvgTicketPrice 100.020528
Cancelled False
DistanceKilometers 0.0
DistanceMiles 0.0
FlightDelay False
FlightDelayMin 0
FlightTimeHour 0.0
FlightTimeMin 0.0
dayOfWeek 0
dtype: object
[53]:
oml_flights.min(numeric_only=True)
[53]:
AvgTicketPrice 100.020531
Cancelled 0.000000
DistanceKilometers 0.000000
DistanceMiles 0.000000
FlightDelay 0.000000
FlightDelayMin 0.000000
FlightTimeHour 0.000000
FlightTimeMin 0.000000
dayOfWeek 0.000000
dtype: float64
mean
[54]:
pd_flights.mean(numeric_only=True)
[54]:
AvgTicketPrice 628.253689
Cancelled 0.128494
DistanceKilometers 7092.142455
DistanceMiles 4406.853013
FlightDelay 0.251168
FlightDelayMin 47.335171
FlightTimeHour 8.518797
FlightTimeMin 511.127842
dayOfWeek 2.835975
dtype: float64
[55]:
oml_flights.mean(numeric_only=True)
[55]:
AvgTicketPrice 628.253689
Cancelled 0.128494
DistanceKilometers 7092.142457
DistanceMiles 4406.853010
FlightDelay 0.251168
FlightDelayMin 47.335171
FlightTimeHour 8.518797
FlightTimeMin 511.127842
dayOfWeek 2.835975
dtype: float64
sum
[56]:
pd_flights.sum(numeric_only=True)
[56]:
AvgTicketPrice 8.204365e+06
Cancelled 1.678000e+03
DistanceKilometers 9.261629e+07
DistanceMiles 5.754909e+07
FlightDelay 3.280000e+03
FlightDelayMin 6.181500e+05
FlightTimeHour 1.112470e+05
FlightTimeMin 6.674818e+06
dayOfWeek 3.703500e+04
dtype: float64
[57]:
oml_flights.sum(numeric_only=True)
[57]:
AvgTicketPrice 8.204365e+06
Cancelled 1.678000e+03
DistanceKilometers 9.261629e+07
DistanceMiles 5.754909e+07
FlightDelay 3.280000e+03
FlightDelayMin 6.181500e+05
FlightTimeHour 1.112470e+05
FlightTimeMin 6.674818e+06
dayOfWeek 3.703500e+04
dtype: float64
[58]:
pd_flights[['Carrier', 'Origin', 'Dest']].nunique()
[58]:
Carrier 4
Origin 156
Dest 156
dtype: int64
[59]:
oml_flights[['Carrier', 'Origin', 'Dest']].nunique()
[59]:
Carrier 4
Origin 156
Dest 156
dtype: int64
[60]:
pd_flights.drop(columns=['AvgTicketPrice',
'Cancelled',
'DestLocation',
'Dest',
'DestAirportID',
'DestCityName',
'DestCountry'])
[60]:
Carrier | DestRegion | ... | dayOfWeek | timestamp | |
---|---|---|---|---|---|
0 | Kibana Airlines | SE-BD | ... | 0 | 2018-01-01 00:00:00 |
1 | Logstash Airways | IT-34 | ... | 0 | 2018-01-01 18:27:00 |
2 | Logstash Airways | IT-34 | ... | 0 | 2018-01-01 17:11:14 |
3 | Kibana Airlines | IT-34 | ... | 0 | 2018-01-01 10:33:28 |
4 | Kibana Airlines | SE-BD | ... | 0 | 2018-01-01 05:13:00 |
... | ... | ... | ... | ... | ... |
13054 | Logstash Airways | SE-BD | ... | 6 | 2018-02-11 20:42:25 |
13055 | Logstash Airways | CH-ZH | ... | 6 | 2018-02-11 01:41:57 |
13056 | Logstash Airways | RU-AMU | ... | 6 | 2018-02-11 04:09:27 |
13057 | JetBeats | SE-BD | ... | 6 | 2018-02-11 08:28:21 |
13058 | JetBeats | US-DC | ... | 6 | 2018-02-11 14:54:34 |
13059 rows × 20 columns
[61]:
oml_flights.drop(columns=['AvgTicketPrice',
'Cancelled',
'DestLocation',
'Dest',
'DestAirportID',
'DestCityName',
'DestCountry'])
[61]:
Carrier | DestRegion | ... | dayOfWeek | timestamp | |
---|---|---|---|---|---|
0 | Kibana Airlines | SE-BD | ... | 0 | 2018-01-01 00:00:00 |
1 | Logstash Airways | IT-34 | ... | 0 | 2018-01-01 18:27:00 |
2 | Logstash Airways | IT-34 | ... | 0 | 2018-01-01 17:11:14 |
3 | Kibana Airlines | IT-34 | ... | 0 | 2018-01-01 10:33:28 |
4 | Kibana Airlines | SE-BD | ... | 0 | 2018-01-01 05:13:00 |
... | ... | ... | ... | ... | ... |
13054 | Logstash Airways | SE-BD | ... | 6 | 2018-02-11 20:42:25 |
13055 | Logstash Airways | CH-ZH | ... | 6 | 2018-02-11 01:41:57 |
13056 | Logstash Airways | RU-AMU | ... | 6 | 2018-02-11 04:09:27 |
13057 | JetBeats | SE-BD | ... | 6 | 2018-02-11 08:28:21 |
13058 | JetBeats | US-DC | ... | 6 | 2018-02-11 14:54:34 |
13059 rows × 20 columns
[62]:
pd_flights.select_dtypes(include=np.number).hist(figsize=[10,10])
plt.show()
[63]:
oml_flights.select_dtypes(include=np.number).hist(figsize=[10,10])
plt.show()
[64]:
oml_flights2 = oml_flights[(oml_flights.OriginAirportID == 'AMS') & (oml_flights.FlightDelayMin > 60)]
oml_flights2 = oml_flights2[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]
oml_flights2 = oml_flights2.tail()
[65]:
print(oml_flights2.os_info())
os_index_pattern: flights
Index:
os_index_field: _id
is_source_field: False
Mappings:
capabilities:
os_field_name is_source os_dtype os_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_os_field_name
timestamp timestamp True date strict_date_hour_minute_second datetime64[ns] True True False timestamp
OriginAirportID OriginAirportID True keyword None object True True False OriginAirportID
DestAirportID DestAirportID True keyword None object True True False DestAirportID
FlightDelayMin FlightDelayMin True integer None int64 True True False FlightDelayMin
Operations:
tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]
size: 5
sort_params: {'_doc': 'desc'}
_source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}}
post_processing: [('sort_index')]
[ ]: