Demo Notebook for Dataframe

Download notebook

Step 0: Imports

[2]:
%load_ext autoreload
%autoreload 2
[3]:
# import this to stop opensearch-py-ml from yelling every time a DataFrame connection made
import warnings
warnings.filterwarnings('ignore')
[4]:
# imports to demonstrate DataFrame support
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import opensearch_py_ml as oml
from opensearchpy import OpenSearch

# Import standard test settings for consistent results
from opensearch_py_ml.conftest import *

Step 1: Setup clients

[5]:
CLUSTER_URL = 'https://localhost:9200'

def get_os_client(cluster_url = CLUSTER_URL,
                  username='admin',
                  password='< admin password >'):
    '''
    Get OpenSearch client
    :param cluster_url: cluster URL like https://ml-te-netwo-1s12ba42br23v-ff1736fa7db98ff2.elb.us-west-2.amazonaws.com:443
    :return: OpenSearch client
    '''
    client = OpenSearch(
        hosts=[cluster_url],
        http_auth=(username, password),
        verify_certs=False
    )
    return client
[6]:
client = get_os_client()

Comparing similarities and differences between panda dataframe vs opensearch_py_ml dataframe

Create an opensearch_py_ml.DataFrame from a flights index

[7]:
oml_flights = oml.DataFrame(client, 'flights')
[8]:
type(oml_flights)
[8]:
opensearch_py_ml.dataframe.DataFrame

Compare to pandas DataFrame (created from the same data)

[9]:
pd_flights = oml.opensearch_to_pandas(oml_flights)
[10]:
type(pd_flights)
[10]:
pandas.core.frame.DataFrame

Attributes and underlying data

DataFrame.columns

[11]:
pd_flights.columns
[11]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
       'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
       'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
       'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
       'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
       'timestamp'],
      dtype='object')
[12]:
oml_flights.columns
[12]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
       'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
       'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
       'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
       'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
       'timestamp'],
      dtype='object')

DataFrame.dtypes

[13]:
pd_flights.dtypes
[13]:
AvgTicketPrice           float64
Cancelled                   bool
Carrier                   object
Dest                      object
DestAirportID             object
                       ...
OriginLocation            object
OriginRegion              object
OriginWeather             object
dayOfWeek                  int64
timestamp         datetime64[ns]
Length: 27, dtype: object
[14]:
oml_flights.dtypes
[14]:
AvgTicketPrice           float64
Cancelled                   bool
Carrier                   object
Dest                      object
DestAirportID             object
                       ...
OriginLocation            object
OriginRegion              object
OriginWeather             object
dayOfWeek                  int64
timestamp         datetime64[ns]
Length: 27, dtype: object

DataFrame.select_dtypes

[15]:
pd_flights.select_dtypes(include=np.number)
[15]:
AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek
0 841.265642 16492.326654 ... 1030.770416 0
1 882.982662 8823.400140 ... 464.389481 0
2 190.636904 0.000000 ... 0.000000 0
3 181.694216 555.737767 ... 222.749059 0
4 730.041778 13358.244200 ... 785.779071 0
... ... ... ... ... ...
13054 1080.446279 8058.581753 ... 402.929088 6
13055 646.612941 7088.598322 ... 644.418029 6
13056 997.751876 10920.652972 ... 937.540811 6
13057 1102.814465 18748.859647 ... 1697.404971 6
13058 858.144337 16809.141923 ... 1610.761827 6

13059 rows × 7 columns

[16]:
oml_flights.select_dtypes(include=np.number)
[16]:
AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek
0 841.265642 16492.326654 ... 1030.770416 0
1 882.982662 8823.400140 ... 464.389481 0
2 190.636904 0.000000 ... 0.000000 0
3 181.694216 555.737767 ... 222.749059 0
4 730.041778 13358.244200 ... 785.779071 0
... ... ... ... ... ...
13054 1080.446279 8058.581753 ... 402.929088 6
13055 646.612941 7088.598322 ... 644.418029 6
13056 997.751876 10920.652972 ... 937.540811 6
13057 1102.814465 18748.859647 ... 1697.404971 6
13058 858.144337 16809.141923 ... 1610.761827 6

13059 rows × 7 columns

DataFrame.empty

[17]:
pd_flights.empty
[17]:
False
[18]:
oml_flights.empty
[18]:
False

DataFrame.shape

[19]:
pd_flights.shape
[19]:
(13059, 27)
[20]:
oml_flights.shape
[20]:
(13059, 27)

DataFrame.index

Note, opensearch_py_ml.DataFrame.index does not mirror pandas.DataFrame.index.

[21]:
pd_flights.index
[21]:
Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '13049', '13050', '13051', '13052', '13053', '13054', '13055', '13056', '13057', '13058'],
      dtype='object', length=13059)
[22]:
# NBVAL_IGNORE_OUTPUT
oml_flights.index
[22]:
<opensearch_py_ml.index.Index at 0x1131744c0>
[24]:
oml_flights.index.os_index_field
[24]:
'_id'

DataFrame.values

Note, opensearch_py_ml.DataFrame.values is not supported.

[25]:
pd_flights.values
[25]:
array([[841.2656419677076, False, 'Kibana Airlines', ..., 'Sunny', 0,
        Timestamp('2018-01-01 00:00:00')],
       [882.9826615595518, False, 'Logstash Airways', ..., 'Clear', 0,
        Timestamp('2018-01-01 18:27:00')],
       [190.6369038508356, False, 'Logstash Airways', ..., 'Rain', 0,
        Timestamp('2018-01-01 17:11:14')],
       ...,
       [997.7518761454494, False, 'Logstash Airways', ..., 'Sunny', 6,
        Timestamp('2018-02-11 04:09:27')],
       [1102.8144645388556, False, 'JetBeats', ..., 'Hail', 6,
        Timestamp('2018-02-11 08:28:21')],
       [858.1443369038839, False, 'JetBeats', ..., 'Rain', 6,
        Timestamp('2018-02-11 14:54:34')]], dtype=object)
[26]:
try:
    oml_flights.values
except AttributeError as e:
    print(e)
This method would scan/scroll the entire OpenSearch index(s) into memory. If this is explicitly required, and there is sufficient memory, call `ed.opensearch_to_pandas(ed_df).values`

Indexing, iteration

DataFrame.head

[27]:
pd_flights.head()
[27]:
AvgTicketPrice Cancelled ... dayOfWeek timestamp
0 841.265642 False ... 0 2018-01-01 00:00:00
1 882.982662 False ... 0 2018-01-01 18:27:00
2 190.636904 False ... 0 2018-01-01 17:11:14
3 181.694216 True ... 0 2018-01-01 10:33:28
4 730.041778 False ... 0 2018-01-01 05:13:00

5 rows × 27 columns

[28]:
oml_flights.head()
[28]:
AvgTicketPrice Cancelled ... dayOfWeek timestamp
0 841.265642 False ... 0 2018-01-01 00:00:00
1 882.982662 False ... 0 2018-01-01 18:27:00
2 190.636904 False ... 0 2018-01-01 17:11:14
3 181.694216 True ... 0 2018-01-01 10:33:28
4 730.041778 False ... 0 2018-01-01 05:13:00

5 rows × 27 columns

DataFrame.tail

[29]:
pd_flights.tail()
[29]:
AvgTicketPrice Cancelled ... dayOfWeek timestamp
13054 1080.446279 False ... 6 2018-02-11 20:42:25
13055 646.612941 False ... 6 2018-02-11 01:41:57
13056 997.751876 False ... 6 2018-02-11 04:09:27
13057 1102.814465 False ... 6 2018-02-11 08:28:21
13058 858.144337 False ... 6 2018-02-11 14:54:34

5 rows × 27 columns

[30]:
oml_flights.tail()
[30]:
AvgTicketPrice Cancelled ... dayOfWeek timestamp
13054 1080.446279 False ... 6 2018-02-11 20:42:25
13055 646.612941 False ... 6 2018-02-11 01:41:57
13056 997.751876 False ... 6 2018-02-11 04:09:27
13057 1102.814465 False ... 6 2018-02-11 08:28:21
13058 858.144337 False ... 6 2018-02-11 14:54:34

5 rows × 27 columns

DataFrame.keys

[31]:
pd_flights.keys()
[31]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
       'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
       'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
       'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
       'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
       'timestamp'],
      dtype='object')
[32]:
oml_flights.keys()
[32]:
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
       'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
       'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
       'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
       'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
       'timestamp'],
      dtype='object')

DataFrame.get

[33]:
pd_flights.get('Carrier')
[33]:
0         Kibana Airlines
1        Logstash Airways
2        Logstash Airways
3         Kibana Airlines
4         Kibana Airlines
               ...
13054    Logstash Airways
13055    Logstash Airways
13056    Logstash Airways
13057            JetBeats
13058            JetBeats
Name: Carrier, Length: 13059, dtype: object
[34]:
oml_flights.get('Carrier')
[34]:
0         Kibana Airlines
1        Logstash Airways
2        Logstash Airways
3         Kibana Airlines
4         Kibana Airlines
               ...
13054    Logstash Airways
13055    Logstash Airways
13056    Logstash Airways
13057            JetBeats
13058            JetBeats
Name: Carrier, Length: 13059, dtype: object
[35]:
pd_flights.get(['Carrier', 'Origin'])
[35]:
Carrier Origin
0 Kibana Airlines Frankfurt am Main Airport
1 Logstash Airways Cape Town International Airport
2 Logstash Airways Venice Marco Polo Airport
3 Kibana Airlines Naples International Airport
4 Kibana Airlines Licenciado Benito Juarez International Airport
... ... ...
13054 Logstash Airways Pisa International Airport
13055 Logstash Airways Winnipeg / James Armstrong Richardson Internat...
13056 Logstash Airways Licenciado Benito Juarez International Airport
13057 JetBeats Itami Airport
13058 JetBeats Adelaide International Airport

13059 rows × 2 columns

List input not currently supported by opensearch_py_ml.DataFrame.get

[36]:
try:
    oml_flights.get(['Carrier', 'Origin'])
except TypeError as e:
    print(e)
unhashable type: 'list'

DataFrame.query

[37]:
pd_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled == True')
[37]:
AvgTicketPrice Cancelled ... dayOfWeek timestamp
8 960.869736 True ... 0 2018-01-01 12:09:35
26 975.812632 True ... 0 2018-01-01 15:38:32
311 946.358410 True ... 0 2018-01-01 11:51:12
651 975.383864 True ... 2 2018-01-03 21:13:17
950 907.836523 True ... 2 2018-01-03 05:14:51
... ... ... ... ... ...
12820 909.973606 True ... 5 2018-02-10 05:11:35
12906 983.429244 True ... 6 2018-02-11 06:19:58
12918 1136.678150 True ... 6 2018-02-11 16:03:10
12919 1105.211803 True ... 6 2018-02-11 05:36:05
13013 1055.350213 True ... 6 2018-02-11 13:20:16

68 rows × 27 columns

opensearch_py_ml.DataFrame.query requires qualifier on bool i.e.

oml_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled') fails

[38]:
oml_flights.query('Carrier == "Kibana Airlines" & AvgTicketPrice > 900.0 & Cancelled == True')
[38]:
AvgTicketPrice Cancelled ... dayOfWeek timestamp
8 960.869736 True ... 0 2018-01-01 12:09:35
26 975.812632 True ... 0 2018-01-01 15:38:32
311 946.358410 True ... 0 2018-01-01 11:51:12
651 975.383864 True ... 2 2018-01-03 21:13:17
950 907.836523 True ... 2 2018-01-03 05:14:51
... ... ... ... ... ...
12820 909.973606 True ... 5 2018-02-10 05:11:35
12906 983.429244 True ... 6 2018-02-11 06:19:58
12918 1136.678150 True ... 6 2018-02-11 16:03:10
12919 1105.211803 True ... 6 2018-02-11 05:36:05
13013 1055.350213 True ... 6 2018-02-11 13:20:16

68 rows × 27 columns

Boolean indexing query

[39]:
pd_flights[(pd_flights.Carrier=="Kibana Airlines") &
           (pd_flights.AvgTicketPrice > 900.0) &
           (pd_flights.Cancelled == True)]
[39]:
AvgTicketPrice Cancelled ... dayOfWeek timestamp
8 960.869736 True ... 0 2018-01-01 12:09:35
26 975.812632 True ... 0 2018-01-01 15:38:32
311 946.358410 True ... 0 2018-01-01 11:51:12
651 975.383864 True ... 2 2018-01-03 21:13:17
950 907.836523 True ... 2 2018-01-03 05:14:51
... ... ... ... ... ...
12820 909.973606 True ... 5 2018-02-10 05:11:35
12906 983.429244 True ... 6 2018-02-11 06:19:58
12918 1136.678150 True ... 6 2018-02-11 16:03:10
12919 1105.211803 True ... 6 2018-02-11 05:36:05
13013 1055.350213 True ... 6 2018-02-11 13:20:16

68 rows × 27 columns

[41]:
oml_flights[(oml_flights.Carrier=="Kibana Airlines") &
           (oml_flights.AvgTicketPrice > 900.0) &
           (oml_flights.Cancelled == True)]
[41]:
AvgTicketPrice Cancelled ... dayOfWeek timestamp
8 960.869736 True ... 0 2018-01-01 12:09:35
26 975.812632 True ... 0 2018-01-01 15:38:32
311 946.358410 True ... 0 2018-01-01 11:51:12
651 975.383864 True ... 2 2018-01-03 21:13:17
950 907.836523 True ... 2 2018-01-03 05:14:51
... ... ... ... ... ...
12820 909.973606 True ... 5 2018-02-10 05:11:35
12906 983.429244 True ... 6 2018-02-11 06:19:58
12918 1136.678150 True ... 6 2018-02-11 16:03:10
12919 1105.211803 True ... 6 2018-02-11 05:36:05
13013 1055.350213 True ... 6 2018-02-11 13:20:16

68 rows × 27 columns

Function application, GroupBy & window

DataFrame.aggs

[42]:
pd_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
[42]:
DistanceKilometers AvgTicketPrice
sum 9.261629e+07 8.204365e+06
min 0.000000e+00 1.000205e+02
std 4.578438e+03 2.663969e+02

opensearch_py_ml.DataFrame.aggregate currently only supported numeric columns

[43]:
oml_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
[43]:
DistanceKilometers AvgTicketPrice
sum 9.261629e+07 8.204365e+06
min 0.000000e+00 1.000205e+02
std 4.578614e+03 2.664071e+02

Computations / descriptive stats

DataFrame.count

[44]:
pd_flights.count()
[44]:
AvgTicketPrice    13059
Cancelled         13059
Carrier           13059
Dest              13059
DestAirportID     13059
                  ...
OriginLocation    13059
OriginRegion      13059
OriginWeather     13059
dayOfWeek         13059
timestamp         13059
Length: 27, dtype: int64
[45]:
oml_flights.count()
[45]:
AvgTicketPrice    13059
Cancelled         13059
Carrier           13059
Dest              13059
DestAirportID     13059
                  ...
OriginLocation    13059
OriginRegion      13059
OriginWeather     13059
dayOfWeek         13059
timestamp         13059
Length: 27, dtype: int64

DataFrame.describe

[46]:
pd_flights.describe()
[46]:
AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek
count 13059.000000 13059.000000 ... 13059.000000 13059.000000
mean 628.253689 7092.142455 ... 511.127842 2.835975
std 266.396861 4578.438497 ... 334.753952 1.939439
min 100.020528 0.000000 ... 0.000000 0.000000
25% 409.893816 2459.705673 ... 252.333192 1.000000
50% 640.556668 7610.330866 ... 503.045170 3.000000
75% 842.185470 9736.637600 ... 720.416036 4.000000
max 1199.729053 19881.482315 ... 1902.902032 6.000000

8 rows × 7 columns

Values returned from opensearch_py_ml.DataFrame.describe may vary due to results of Opensearch aggregations.

[47]:
# NBVAL_IGNORE_OUTPUT
oml_flights.describe()
[47]:
AvgTicketPrice Cancelled ... FlightTimeMin dayOfWeek
count 13059.000000 13059.000000 ... 13059.000000 13059.000000
mean 628.253689 0.128494 ... 511.127842 2.835975
std 266.407061 0.334664 ... 334.766770 1.939513
min 100.020531 0.000000 ... 0.000000 0.000000
25% 410.008918 0.000000 ... 251.938710 1.000000
50% 640.387285 0.000000 ... 503.148975 3.000000
75% 842.213490 0.000000 ... 720.505705 4.000000
max 1199.729004 1.000000 ... 1902.901978 6.000000

8 rows × 9 columns

DataFrame.info

[48]:
pd_flights.info()
<class 'pandas.core.frame.DataFrame'>
Index: 13059 entries, 0 to 13058
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   AvgTicketPrice      13059 non-null  float64
 1   Cancelled           13059 non-null  bool
 2   Carrier             13059 non-null  object
 3   Dest                13059 non-null  object
 4   DestAirportID       13059 non-null  object
 5   DestCityName        13059 non-null  object
 6   DestCountry         13059 non-null  object
 7   DestLocation        13059 non-null  object
 8   DestRegion          13059 non-null  object
 9   DestWeather         13059 non-null  object
 10  DistanceKilometers  13059 non-null  float64
 11  DistanceMiles       13059 non-null  float64
 12  FlightDelay         13059 non-null  bool
 13  FlightDelayMin      13059 non-null  int64
 14  FlightDelayType     13059 non-null  object
 15  FlightNum           13059 non-null  object
 16  FlightTimeHour      13059 non-null  float64
 17  FlightTimeMin       13059 non-null  float64
 18  Origin              13059 non-null  object
 19  OriginAirportID     13059 non-null  object
 20  OriginCityName      13059 non-null  object
 21  OriginCountry       13059 non-null  object
 22  OriginLocation      13059 non-null  object
 23  OriginRegion        13059 non-null  object
 24  OriginWeather       13059 non-null  object
 25  dayOfWeek           13059 non-null  int64
 26  timestamp           13059 non-null  datetime64[ns]
dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)
memory usage: 3.1+ MB
[49]:
oml_flights.info()
<class 'opensearch_py_ml.dataframe.DataFrame'>
Index: 13059 entries, 0 to 13058
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   AvgTicketPrice      13059 non-null  float64
 1   Cancelled           13059 non-null  bool
 2   Carrier             13059 non-null  object
 3   Dest                13059 non-null  object
 4   DestAirportID       13059 non-null  object
 5   DestCityName        13059 non-null  object
 6   DestCountry         13059 non-null  object
 7   DestLocation        13059 non-null  object
 8   DestRegion          13059 non-null  object
 9   DestWeather         13059 non-null  object
 10  DistanceKilometers  13059 non-null  float64
 11  DistanceMiles       13059 non-null  float64
 12  FlightDelay         13059 non-null  bool
 13  FlightDelayMin      13059 non-null  int64
 14  FlightDelayType     13059 non-null  object
 15  FlightNum           13059 non-null  object
 16  FlightTimeHour      13059 non-null  float64
 17  FlightTimeMin       13059 non-null  float64
 18  Origin              13059 non-null  object
 19  OriginAirportID     13059 non-null  object
 20  OriginCityName      13059 non-null  object
 21  OriginCountry       13059 non-null  object
 22  OriginLocation      13059 non-null  object
 23  OriginRegion        13059 non-null  object
 24  OriginWeather       13059 non-null  object
 25  dayOfWeek           13059 non-null  int64
 26  timestamp           13059 non-null  datetime64[ns]
dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)
memory usage: 64.000 bytes
OpenSearch storage usage: 10.714 MB

DataFrame.max, DataFrame.min, DataFrame.mean, DataFrame.sum

max

[50]:
pd_flights.max(numeric_only=True)
[50]:
AvgTicketPrice         1199.729053
Cancelled                     True
DistanceKilometers    19881.482315
DistanceMiles         12353.780369
FlightDelay                   True
FlightDelayMin                 360
FlightTimeHour           31.715034
FlightTimeMin          1902.902032
dayOfWeek                        6
dtype: object

opensearch_py_ml.DataFrame.max,min,mean,sum only aggregate numeric columns

[51]:
oml_flights.max(numeric_only=True)
[51]:
AvgTicketPrice         1199.729004
Cancelled                 1.000000
DistanceKilometers    19881.482422
DistanceMiles         12353.780273
FlightDelay               1.000000
FlightDelayMin          360.000000
FlightTimeHour           31.715034
FlightTimeMin          1902.901978
dayOfWeek                 6.000000
dtype: float64

min

[52]:
pd_flights.min(numeric_only=True)
[52]:
AvgTicketPrice        100.020528
Cancelled                  False
DistanceKilometers           0.0
DistanceMiles                0.0
FlightDelay                False
FlightDelayMin                 0
FlightTimeHour               0.0
FlightTimeMin                0.0
dayOfWeek                      0
dtype: object
[53]:
oml_flights.min(numeric_only=True)
[53]:
AvgTicketPrice        100.020531
Cancelled               0.000000
DistanceKilometers      0.000000
DistanceMiles           0.000000
FlightDelay             0.000000
FlightDelayMin          0.000000
FlightTimeHour          0.000000
FlightTimeMin           0.000000
dayOfWeek               0.000000
dtype: float64

mean

[54]:
pd_flights.mean(numeric_only=True)
[54]:
AvgTicketPrice         628.253689
Cancelled                0.128494
DistanceKilometers    7092.142455
DistanceMiles         4406.853013
FlightDelay              0.251168
FlightDelayMin          47.335171
FlightTimeHour           8.518797
FlightTimeMin          511.127842
dayOfWeek                2.835975
dtype: float64
[55]:
oml_flights.mean(numeric_only=True)
[55]:
AvgTicketPrice         628.253689
Cancelled                0.128494
DistanceKilometers    7092.142457
DistanceMiles         4406.853010
FlightDelay              0.251168
FlightDelayMin          47.335171
FlightTimeHour           8.518797
FlightTimeMin          511.127842
dayOfWeek                2.835975
dtype: float64

sum

[56]:
pd_flights.sum(numeric_only=True)
[56]:
AvgTicketPrice        8.204365e+06
Cancelled             1.678000e+03
DistanceKilometers    9.261629e+07
DistanceMiles         5.754909e+07
FlightDelay           3.280000e+03
FlightDelayMin        6.181500e+05
FlightTimeHour        1.112470e+05
FlightTimeMin         6.674818e+06
dayOfWeek             3.703500e+04
dtype: float64
[57]:
oml_flights.sum(numeric_only=True)
[57]:
AvgTicketPrice        8.204365e+06
Cancelled             1.678000e+03
DistanceKilometers    9.261629e+07
DistanceMiles         5.754909e+07
FlightDelay           3.280000e+03
FlightDelayMin        6.181500e+05
FlightTimeHour        1.112470e+05
FlightTimeMin         6.674818e+06
dayOfWeek             3.703500e+04
dtype: float64
[58]:
pd_flights[['Carrier', 'Origin', 'Dest']].nunique()
[58]:
Carrier      4
Origin     156
Dest       156
dtype: int64
[59]:
oml_flights[['Carrier', 'Origin', 'Dest']].nunique()
[59]:
Carrier      4
Origin     156
Dest       156
dtype: int64
[60]:
pd_flights.drop(columns=['AvgTicketPrice',
                         'Cancelled',
                         'DestLocation',
                         'Dest',
                         'DestAirportID',
                         'DestCityName',
                         'DestCountry'])
[60]:
Carrier DestRegion ... dayOfWeek timestamp
0 Kibana Airlines SE-BD ... 0 2018-01-01 00:00:00
1 Logstash Airways IT-34 ... 0 2018-01-01 18:27:00
2 Logstash Airways IT-34 ... 0 2018-01-01 17:11:14
3 Kibana Airlines IT-34 ... 0 2018-01-01 10:33:28
4 Kibana Airlines SE-BD ... 0 2018-01-01 05:13:00
... ... ... ... ... ...
13054 Logstash Airways SE-BD ... 6 2018-02-11 20:42:25
13055 Logstash Airways CH-ZH ... 6 2018-02-11 01:41:57
13056 Logstash Airways RU-AMU ... 6 2018-02-11 04:09:27
13057 JetBeats SE-BD ... 6 2018-02-11 08:28:21
13058 JetBeats US-DC ... 6 2018-02-11 14:54:34

13059 rows × 20 columns

[61]:
oml_flights.drop(columns=['AvgTicketPrice',
                         'Cancelled',
                         'DestLocation',
                         'Dest',
                         'DestAirportID',
                         'DestCityName',
                         'DestCountry'])
[61]:
Carrier DestRegion ... dayOfWeek timestamp
0 Kibana Airlines SE-BD ... 0 2018-01-01 00:00:00
1 Logstash Airways IT-34 ... 0 2018-01-01 18:27:00
2 Logstash Airways IT-34 ... 0 2018-01-01 17:11:14
3 Kibana Airlines IT-34 ... 0 2018-01-01 10:33:28
4 Kibana Airlines SE-BD ... 0 2018-01-01 05:13:00
... ... ... ... ... ...
13054 Logstash Airways SE-BD ... 6 2018-02-11 20:42:25
13055 Logstash Airways CH-ZH ... 6 2018-02-11 01:41:57
13056 Logstash Airways RU-AMU ... 6 2018-02-11 04:09:27
13057 JetBeats SE-BD ... 6 2018-02-11 08:28:21
13058 JetBeats US-DC ... 6 2018-02-11 14:54:34

13059 rows × 20 columns

[62]:
pd_flights.select_dtypes(include=np.number).hist(figsize=[10,10])
plt.show()
../_images/examples_demo_notebook_98_0.png
[63]:
oml_flights.select_dtypes(include=np.number).hist(figsize=[10,10])
plt.show()
../_images/examples_demo_notebook_99_0.png
[64]:
oml_flights2 = oml_flights[(oml_flights.OriginAirportID == 'AMS') & (oml_flights.FlightDelayMin > 60)]
oml_flights2 = oml_flights2[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]
oml_flights2 = oml_flights2.tail()
[65]:
print(oml_flights2.os_info())
os_index_pattern: flights
Index:
 os_index_field: _id
 is_source_field: False
Mappings:
 capabilities:
                   os_field_name  is_source os_dtype                  os_date_format        pd_dtype  is_searchable  is_aggregatable  is_scripted aggregatable_os_field_name
timestamp              timestamp       True     date  strict_date_hour_minute_second  datetime64[ns]           True             True        False                  timestamp
OriginAirportID  OriginAirportID       True  keyword                            None          object           True             True        False            OriginAirportID
DestAirportID      DestAirportID       True  keyword                            None          object           True             True        False              DestAirportID
FlightDelayMin    FlightDelayMin       True  integer                            None           int64           True             True        False             FlightDelayMin
Operations:
 tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]
 size: 5
 sort_params: {'_doc': 'desc'}
 _source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
 body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}}
 post_processing: [('sort_index')]

[ ]: