There has been feedback about delays in Grab transport services in both Jakarta and Singapore (late driver arrival → late arrival at destination). This has led to a decrease in customer satisfaction and trust, which might increase customer turnover. Upon further research, it was found that this could be due to inaccurate ETA prediction, bad route planning, or other factors. Using the above dataset of GPS pings, what insights can your team obtain to help Grab alleviate these issues?
Firstly, we start by importing the necessary packages and setting our directory location.
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.formula.api as ssf
import statsmodels.api as sm
import os
from datetime import datetime, date
import folium
from folium import plugins
from branca.element import Figure
from math import radians, cos, sin, asin, sqrt
# Change according to the location of your datasets
sg_directory = r'C:\Users\alif8\Documents\NUS\Case Competitions\NUS Data Analytics Competition 2022\grab-posis-city=Singapore\city=Singapore'
jkt_directory = r'C:\Users\alif8\Documents\NUS\Case Competitions\NUS Data Analytics Competition 2022\grab-posis-city=Jakarta\city=Jakarta'
# Latitude & Longitude of Singapore
sg_lat = 1.290270
sg_lng = 103.851959
# Latitude & Longitude of Jakarta
jkt_lat = -6.2088
jkt_lng = 106.8456
Next, we set up the helper functions that will help us use more memory-efficient datatypes and build datetime columns to aid in our analysis.
# Note: Code was obtained from NUS SDS Workshop
def to_category(df, *args):
    for col_name in args:
        df[col_name] = df[col_name].astype("category")

def to_float32(df, *args):
    for col_name in args:
        df[col_name] = df[col_name].astype("float32")

def to_uint16(df, *args):
    for col_name in args:
        df[col_name] = df[col_name].astype("uint16")

def to_int32(df, *args):
    for col_name in args:
        df[col_name] = df[col_name].astype("int32")

def format_datetime(df, col_name):
    # Get datetime objects for all timestamps
    dt = df[col_name].apply(datetime.fromtimestamp)
    df["time"] = dt.apply(lambda x: x.time())
    df["hour"] = dt.apply(lambda x: x.hour)
    df["day_of_week"] = dt.apply(lambda x: x.weekday())
    df["month"] = dt.apply(lambda x: x.month)
    df["year"] = dt.apply(lambda x: x.year)
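To make the savings from these dtype helpers concrete, here is a small self-contained sketch on synthetic data (hypothetical values, not the Grab dataset): float64 columns halve in size when downcast to float32, and low-cardinality string columns shrink dramatically as categories.

```python
import numpy as np
import pandas as pd

# Synthetic stand-in for the ping data (hypothetical values, 100k rows)
df = pd.DataFrame({
    "rawlat": np.random.uniform(1.2, 1.5, 100_000),           # float64 by default
    "osname": np.random.choice(["ios", "android"], 100_000),  # python strings
})
before = df.memory_usage(deep=True).sum()

# Apply the same downcasts the helpers above perform
df["rawlat"] = df["rawlat"].astype("float32")
df["osname"] = df["osname"].astype("category")
after = df.memory_usage(deep=True).sum()

print(f"{before:,} bytes -> {after:,} bytes")
```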
We then combined all the datasets into one large Pandas DataFrame. Initially, when exploring and testing out various approaches to the dataset, we used only a single parquet file in order to reduce processing time.
# Merges all datasets into one DataFrame; note that it will have a long runtime
# Input is either sg_directory or jkt_directory
def file_combine(directory):
    dataset = []
    for filename in os.listdir(directory):
        if filename.endswith(".parquet"):
            next_df = pd.read_parquet(directory + r'/' + filename)
            # Alternative configuration to filter by car/motorcycle for the Jakarta dataset
            # next_df = pd.read_parquet(directory + r'/' + filename, filters=[('driving_mode', '=', 'car')])
            dataset.append(next_df)
    return pd.concat(dataset, axis=0, ignore_index=True)
df = file_combine(sg_directory)
df_sample = pd.read_parquet(sg_directory + r'/' + 'part-00000-8bbff892-97d2-4011-9961-703e38972569.c000.snappy.parquet')
# Can change line below to df_fixed = df_sample.copy() to replicate analysis with the sample instead
df_fixed = df.copy()
format_datetime(df_fixed, "pingtimestamp")
to_category(df_fixed, "trj_id", "driving_mode", "osname")
to_float32(df_fixed, "rawlat", "rawlng", "speed", "accuracy")
to_uint16(df_fixed, "bearing", "day_of_week", "month", "year")
to_int32(df_fixed, "pingtimestamp")
df_fixed.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30329685 entries, 0 to 30329684
Data columns (total 14 columns):
 #   Column         Dtype
---  ------         -----
 0   trj_id         category
 1   driving_mode   category
 2   osname         category
 3   pingtimestamp  int32
 4   rawlat         float32
 5   rawlng         float32
 6   speed          float32
 7   bearing        uint16
 8   accuracy       float32
 9   time           object
 10  hour           int64
 11  day_of_week    uint16
 12  month          uint16
 13  year           uint16
dtypes: category(3), float32(4), int32(1), int64(1), object(1), uint16(4)
memory usage: 1.4+ GB
Firstly, we wanted to understand how many unique trips there were.
len(df_fixed['trj_id'].unique())
28000
Next, we used the Pandas groupby function to aggregate the dataset so that each unique trip occupies only one row. We added columns to capture the start points (start_lat, start_lng), the end points (end_lat, end_lng), and the trip's average speed.
This was done because our approach was to compare trips with similar start and end points and see whether their routes differed.
start_points = df_fixed.groupby(['trj_id'])['time'].transform('min') == df_fixed['time']
start_points
df_start = df_fixed.loc[start_points].copy()
df_start.rename(columns={
    'rawlat': 'start_lat',
    'rawlng': 'start_lng'}, inplace=True)
end_points = df_fixed.groupby(['trj_id'])['time'].transform('max') == df_fixed['time']
df_end = df_fixed.loc[end_points].copy()
df_end = df_end[['trj_id', 'rawlat', 'rawlng']]
print(df_end)
df_end.rename(columns={
    'rawlat': 'end_lat',
    'rawlng': 'end_lng'}, inplace=True)
df_end_time = df_fixed.loc[end_points].copy()
df_end_time = df_end_time[['trj_id', 'time']]
df_end_time.rename(columns={
    'time': 'end_time'}, inplace=True)
print(df_end_time)
mean_speeds = df_fixed.groupby(['trj_id']).agg(
    mean_speed=pd.NamedAgg(column='speed', aggfunc='mean'))
df_start_end = pd.merge(df_start, df_end, how='left', on='trj_id')
df_start_end = pd.merge(df_start_end, df_end_time, how='left', on='trj_id')
df_start_end = pd.merge(df_start_end, mean_speeds, how='left', on='trj_id')
           trj_id    rawlat      rawlng
1262         3671  1.299857  103.855202
2553        65747  1.375112  103.875252
4314        83162  1.329180  103.841675
8421        75831  1.392944  103.904289
9120        29097  1.281594  103.860527
...           ...       ...         ...
30323879    59810  1.431720  103.769547
30325290    71537  1.327284  103.841484
30326220    73362  1.355363  103.736534
30328387    63513  1.342762  103.746452
30329180    53168  1.343377  103.838226

[28000 rows x 3 columns]

           trj_id  end_time
1262         3671  11:14:17
2553        65747  20:32:32
4314        83162  09:37:58
8421        75831  19:44:55
9120        29097  09:40:41
...           ...       ...
30323879    59810  01:14:54
30325290    71537  09:47:42
30326220    73362  07:30:28
30328387    63513  08:48:01
30329180    53168  12:51:41

[28000 rows x 2 columns]
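As an aside, the same start/end/mean-speed summary can be built in one pass by sorting on pingtimestamp and aggregating with first/last; this also avoids comparing on the `time` column, which can misbehave for trips that cross midnight. A minimal sketch on toy data (hypothetical values, not the Grab dataset):

```python
import pandas as pd

# Toy pings: two trips, timestamps deliberately out of order
pings = pd.DataFrame({
    "trj_id":        ["a", "a", "a", "b", "b"],
    "pingtimestamp": [3, 1, 2, 10, 11],
    "rawlat":        [1.31, 1.30, 1.305, 1.35, 1.36],
    "rawlng":        [103.85, 103.84, 103.845, 103.90, 103.91],
    "speed":         [10.0, 0.0, 5.0, 20.0, 22.0],
})

# Sort by timestamp, then take the first/last ping per trip in one aggregation
agg = (pings.sort_values("pingtimestamp")
            .groupby("trj_id")
            .agg(start_lat=("rawlat", "first"), start_lng=("rawlng", "first"),
                 end_lat=("rawlat", "last"), end_lng=("rawlng", "last"),
                 mean_speed=("speed", "mean"))
            .reset_index())
print(agg)
```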
Subsequently, we constructed a custom Trip object class that stores certain key attributes, as a helper for us to calculate the distances between points later on.
class Trip:
    def __init__(self, trj_id, start_lat, start_lng, end_lat, end_lng, mean_speed):
        self.trj_id = trj_id
        self.start_lat = start_lat
        self.start_lng = start_lng
        self.end_lat = end_lat
        self.end_lng = end_lng
        self.mean_speed = mean_speed

    def check(self):
        return "test"

def trip_constructor(row):
    return Trip(row['trj_id'], row['start_lat'], row['start_lng'], row['end_lat'], row['end_lng'], row['mean_speed'])

df_start_end['trip_obj'] = df_start_end.apply(trip_constructor, axis=1)
We also set up additional helper functions.
The first helper function calculates distance between points via latitude & longitude, using the Haversine formula.
The second helper function is a boolean function that uses the first helper function to check whether two trip objects have start and end points within 0.1 km of each other.
def distance(lat1, lat2, lon1, lon2):
    # math.radians converts degrees to radians
    lon1 = radians(lon1)
    lon2 = radians(lon2)
    lat1 = radians(lat1)
    lat2 = radians(lat2)
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))
    # Radius of Earth in kilometres (use 3956 for miles)
    r = 6371
    return c * r
def same_start_end(trip1, trip2):
    # Threshold (in km) can be adjusted as needed
    threshold = 0.1
    start_same = abs(distance(
        trip1.start_lat,
        trip2.start_lat,
        trip1.start_lng,
        trip2.start_lng)) <= threshold
    end_same = abs(distance(
        trip1.end_lat,
        trip2.end_lat,
        trip1.end_lng,
        trip2.end_lng)) <= threshold
    return start_same and end_same
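As a quick sanity check of the distance helper (reproduced here so the snippet runs standalone), the distance from central Singapore (1.290270, 103.851959) to Changi Airport (1.3644, 103.9915) should come out to roughly 17-18 km:

```python
from math import radians, cos, sin, asin, sqrt

def distance(lat1, lat2, lon1, lon2):
    # Haversine great-circle distance in kilometres
    lon1, lon2, lat1, lat2 = map(radians, (lon1, lon2, lat1, lat2))
    a = sin((lat2 - lat1) / 2)**2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2)**2
    return 2 * asin(sqrt(a)) * 6371

# Central Singapore -> Changi Airport, both coordinates taken from this notebook
d = distance(1.290270, 1.3644, 103.851959, 103.9915)
print(round(d, 2))  # roughly 17-18 km
```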
As mentioned earlier, we want to look for pairs of trips with similar start and end points, in order to observe the routes taken.
At this point, we can select the time of the day we want to draw our pairs of trips from, based on their trip start hour.
# Select time of day to draw pairs of trips from
hours = [10]
df_sample_sample = df_start_end[df_start_end.hour.isin(hours)]
trip_array = df_sample_sample['trip_obj'].to_list()
comparable_trips = []
# Pairwise comparison of all trips in the selected hours; O(n^2) in the number of trips
for i in range(len(trip_array)):
    for j in range(i + 1, len(trip_array)):
        if same_start_end(trip_array[i], trip_array[j]):
            comparable_trips.append([trip_array[i], trip_array[j]])
comparable_trips_id = list(map(lambda x: [x[0].trj_id, x[1].trj_id], comparable_trips))
comparable_trips_id
# Notable points that we have found:
# 17hrs [0]
# 8hrs [10]
# 10 hrs [0]
[['79033', '67609'], ['28599', '68684'], ['1423', '76740'], ['49137', '79675']]
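One caveat on the pairing step: the nested loop compares every pair of trips, which gets expensive when many hours are selected. A hedged sketch of a grid-bucketing prefilter (the function name `candidate_pairs` and the cell size are our own, not from the notebook): trips are bucketed by rounded start coordinates, and only trips in the same or adjacent cells are kept as candidates for the exact `same_start_end` check.

```python
from collections import defaultdict
from itertools import combinations, product

CELL = 0.002  # degrees; ~0.2 km, so points within 0.1 km share a cell or touch a neighbour

def grid_key(lat, lng):
    return (int(lat // CELL), int(lng // CELL))

def candidate_pairs(trips):
    """trips: list of (trip_id, start_lat, start_lng).
    Returns id pairs whose start points fall in the same or adjacent grid cells;
    the exact Haversine check would still run on these candidates afterwards."""
    buckets = defaultdict(list)
    for trip in trips:
        buckets[grid_key(trip[1], trip[2])].append(trip)
    pairs = set()
    for (gx, gy) in list(buckets):
        # Pool this cell with its 8 neighbours to catch pairs straddling a boundary
        pool = []
        for dx, dy in product((-1, 0, 1), repeat=2):
            pool.extend(buckets.get((gx + dx, gy + dy), []))
        for a, b in combinations(pool, 2):
            if a[0] != b[0]:
                pairs.add(tuple(sorted((a[0], b[0]))))
    return pairs

# Toy example: "a" and "b" start ~60 m apart, "c" starts far away
pairs = candidate_pairs([("a", 1.3000, 103.8000),
                         ("b", 1.3005, 103.8004),
                         ("c", 1.4000, 103.9000)])
print(pairs)
```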
From the above pairs of trips, we can select a pair and observe the routes taken.
# Select test pair here
test_pair = comparable_trips_id[1]
# Alternatively, to manually compare any two specific trips, set test_pair to their respective ids
#test_pair = ['59076','76917']
test_1 = test_pair[0]
test_2 = test_pair[1]
df_test = df_start_end['trj_id'].isin([test_1, test_2])
df_start_end[df_test]
df_first_col = df_start_end[df_test].head(1)
Now we would like to plot the routes our selected pair of trips took. First, we will set up some helper functions to aid in our analysis.
# Note: Code for the first 2 functions was obtained from NUS SDS Workshop
def get_route(df, trj_id):
    return df.query(f"trj_id == '{trj_id}'").sort_values("pingtimestamp")[["rawlat", "rawlng"]]

def get_start_end_pos(trj):
    return (trj.iloc[0, :], trj.iloc[-1, :])

def get_mean_speed(df, trj_id):
    return round(df.query(f"trj_id == '{trj_id}'")['mean_speed'].values[0], 2)
Now, we are ready to plot the routes of our pair of trips.
We repeated this process with multiple pairs to observe if there were any strange routes or anomalies. We did realise that during peak periods, the routes taken can be very inconsistent.
route_1 = get_route(df_fixed, test_1)
route_2 = get_route(df_fixed, test_2)
route_1_mean_speed = get_mean_speed(df_start_end, test_1)
route_2_mean_speed = get_mean_speed(df_start_end, test_2)
info_1 = f"ID: {test_1}\nAvg Speed: {route_1_mean_speed}"
info_2 = f"ID: {test_2}\nAvg Speed: {route_2_mean_speed}"
fig_trj = Figure(height = 550, width = 750)
m_trj = folium.Map(location = [sg_lat, sg_lng],
                   tiles = 'cartodbpositron', zoom_start = 11,
                   min_zoom = 11, max_zoom = 18)
fig_trj.add_child(m_trj)
f1 = folium.FeatureGroup(f"{test_1}")
f2 = folium.FeatureGroup(f"{test_2}")
line_1 = folium.vector_layers.PolyLine(route_1.values,
                                       popup = '<b>Path of Vehicle_1</b>',
                                       tooltip = f"{info_1}",
                                       color = 'blue', weight = 1).add_to(f1)
line_2 = folium.vector_layers.PolyLine(route_2.values,
                                       popup = '<b>Path of Vehicle_2</b>',
                                       tooltip = f"{info_2}",
                                       color = 'red', weight = 1).add_to(f2)
f1.add_to(m_trj)
f2.add_to(m_trj)
route_1_positions = get_start_end_pos(route_1)
route_2_positions = get_start_end_pos(route_2)
folium.Marker(
    location = [route_1_positions[1][0], route_1_positions[1][1]],
    popup = f"{info_1}",
    icon = folium.Icon(color = "blue"),
).add_to(f1)
folium.Marker(
    location = [route_2_positions[0][0], route_2_positions[0][1]],
    popup = f"{info_2}",
    icon = folium.Icon(color = "red"),
).add_to(f2)
folium.LayerControl().add_to(m_trj)
m_trj
Having found that the routes recommended for trips with similar start and end points can be very different, we decided to further investigate this phenomenon, using selected popular destinations and residential areas of Singapore.
We then collected all the trips that started and ended near our chosen start and end points.
# Threshold represents the radius (in km) around the selected destination to collect trips from
threshold = 2
# Coordinates of selected destinations and areas
# Singapore
cck = [1.3840, 103.7470]
orchard = [1.3048, 103.8318]
nus = [1.2966, 103.7764]
changi_airport = [1.3644, 103.9915]
city_hall = [1.2931, 103.8520]
bukit_panjang = [1.3774, 103.7719]
amk = [1.3691, 103.8454]
# Jakarta
menteng_CBD = [-6.1960, 106.8331]
jkt_airport = [-6.1271, 106.6535]
kemang_village = [-6.2778, 106.8114]
gambir = [-6.1703, 106.8148]
grand_indonesia_mall = [-6.1952, 106.8204]
# Represented as [start, end]
test_pair_start = [amk, city_hall]
# Helper functions to find similar start and end points
def helper_start(a, b, c):
    return distance(a['start_lat'], b, a['start_lng'], c)

def helper_end(a, b, c):
    return distance(a['end_lat'], b, a['end_lng'], c)

travel_time = df_start_end.apply(lambda x: datetime.combine(date.min, x['end_time']) -
                                           datetime.combine(date.min, x['time']), axis = 1)
boolean_start = df_start_end.apply(lambda x: helper_start(x, test_pair_start[0][0],
                                                          test_pair_start[0][1]) < threshold, axis = 1)
boolean_end = df_start_end.apply(lambda x: helper_end(x, test_pair_start[1][0],
                                                      test_pair_start[1][1]) < threshold, axis = 1)
travel_time.name = 'travel_time'
boolean_start.name = 'start_bool'
boolean_end.name = 'end_bool'
result = pd.concat([df_start_end, travel_time], axis=1)
result = pd.concat([result, boolean_start], axis=1)
result = pd.concat([result, boolean_end], axis=1)
result_filtered = result[result.start_bool & result.end_bool]
result_filtered.head(5)
| | trj_id | driving_mode | osname | pingtimestamp | start_lat | start_lng | speed | bearing | accuracy | time | ... | month | year | end_lat | end_lng | end_time | mean_speed | trip_obj | travel_time | start_bool | end_bool |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 118 | 79954 | car | ios | 1554860420 | 1.376810 | 103.847389 | 1.190000 | 109 | 5.000 | 09:40:20 | ... | 4 | 2019 | 1.301427 | 103.842964 | 09:58:37 | 10.439263 | <__main__.Trip object at 0x0000019E76B78490> | 0 days 00:18:17 | True | True |
| 1795 | 77812 | car | android | 1554938856 | 1.375896 | 103.856216 | 14.306968 | 80 | 4.842 | 07:27:36 | ... | 4 | 2019 | 1.285942 | 103.861633 | 07:45:28 | 11.585155 | <__main__.Trip object at 0x0000019EC45B0C10> | 0 days 00:17:52 | True | True |
| 2041 | 59247 | car | ios | 1554768550 | 1.360194 | 103.843506 | 1.820000 | 310 | 10.000 | 08:09:10 | ... | 4 | 2019 | 1.294737 | 103.844154 | 08:27:58 | 10.215965 | <__main__.Trip object at 0x0000019EC3C1BAF0> | 0 days 00:18:48 | True | True |
| 2191 | 76917 | car | android | 1555581245 | 1.362472 | 103.856041 | 8.320000 | 170 | 50.000 | 17:54:05 | ... | 4 | 2019 | 1.278994 | 103.859413 | 18:23:14 | 5.865573 | <__main__.Trip object at 0x0000019EC2E11790> | 0 days 00:29:09 | True | True |
| 2534 | 46166 | car | android | 1555291918 | 1.376211 | 103.834267 | 4.311130 | 230 | 4.000 | 09:31:58 | ... | 4 | 2019 | 1.309097 | 103.846489 | 09:52:05 | 10.304956 | <__main__.Trip object at 0x0000019EC2AA98E0> | 0 days 00:20:07 | True | True |
5 rows × 22 columns
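The travel_time computation above relies on a small trick worth noting: datetime.time objects cannot be subtracted directly, so both times are anchored to the same dummy date (date.min) and the resulting datetimes are subtracted. A minimal illustration using the times from the first row above; note that a trip crossing midnight would yield a negative timedelta with this approach:

```python
from datetime import date, datetime, time, timedelta

start = time(9, 40, 20)  # trip start time (from the first row above)
end = time(9, 58, 37)    # trip end time

# Anchor both times to the same dummy date so they become subtractable datetimes
travel = datetime.combine(date.min, end) - datetime.combine(date.min, start)
print(travel)  # 0:18:17
```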
We decided the best way to visualise this information was through HeatMaps. Since we had access to the start time of the trips, we generated both HeatMaps and HeatMapsWithTime to visualise how the routes differed.
In order to do this, we collected the full routes of all the trips that we collated above.
# List of lists of lists, where the outermost list is indexed by trip start hour
# and each innermost list holds the coordinates of a ping
time_pings = [[] for i in range(0, 24)]
# List of lists, where each inner list holds the coordinates of a ping
nontime_pings = []

# Helper function that adds to both time_pings and nontime_pings simultaneously
def add_pings(row):
    trj_id_key = row['trj_id']
    hour = row['hour']
    global time_pings
    global nontime_pings
    route = get_route(df_fixed, trj_id_key).values.tolist()
    time_pings[hour] += route
    nontime_pings += route

result_filtered.apply(add_pings, axis=1)
Now, we plot the HeatMap.
# Gradient of HeatMap
gradient = {'0':'Navy', '0.25':'Blue','0.5':'Green', '0.75':'Yellow','1': 'Red'}
# Alternative gradient setting
# gradient={.2:'Yellow', .5:'Green', .75: 'Pink',1: 'Yellow'}
fig_heatmap = Figure(height = 550, width = 750)
# Creating map
map_heatmap = folium.Map([sg_lat, sg_lng], zoom_start = 11,
                         min_zoom = 11, max_zoom = 16)
fig_heatmap.add_child(map_heatmap)
# Creating HeatMap
plugins.HeatMap(nontime_pings, radius = 4, blur = 5, gradient=gradient).add_to(map_heatmap)
map_heatmap
Lastly, we plot the HeatMapWithTime.
fig_heatmaptime = Figure(height = 550, width = 750)
# Creating map
map_heatmaptime = folium.Map([sg_lat, sg_lng], zoom_start = 11,
                             min_zoom = 11, max_zoom = 16,
                             tiles = "cartodbpositron")
fig_heatmaptime.add_child(map_heatmaptime)
# Alternative method of visualising HeatMapWithTime: instead of going by start-time
# hour, collate pings into peak hours and non-peak hours
new_time_pings = [[], []]
# sum(..., []) flattens the per-hour ping lists in each slice into a single list
new_time_pings[0] = sum(time_pings[7:10], []) + sum(time_pings[17:20], [])
new_time_pings[1] = sum(time_pings[:7], []) + sum(time_pings[10:17], []) + sum(time_pings[20:], [])
# Creating HeatMapWithTime
plugins.HeatMapWithTime(time_pings, radius = 6, gradient=gradient).add_to(map_heatmaptime)
map_heatmaptime
We will investigate outliers and anomalies and study why they took an odd route/longer time.
# Display details of test_1 and test_2 that was set previously
result_display = result['trj_id'].isin([test_1, test_2])
result[result_display][["trj_id",'mean_speed',"time","end_time","travel_time","day_of_week"]]
| | trj_id | mean_speed | time | end_time | travel_time | day_of_week |
|---|---|---|---|---|---|---|
| 4597 | 28599 | 14.356527 | 10:40:48 | 10:57:42 | 0 days 00:16:54 | 3 |
| 14312 | 68684 | 14.090349 | 10:59:06 | 11:15:59 | 0 days 00:16:53 | 4 |
We printed out the DataFrame of all the trips that were plotted in the HeatMaps and observed their travel times and whether they occurred during peak hours.
results_df = result_filtered[["trj_id",'mean_speed',"time","end_time","travel_time","day_of_week"]]
results_df.sort_values(by = ['travel_time'])
# Alternatively, we could also sort by mean_speed
#results_df.sort_values(by = ['mean_speed'])
| | trj_id | mean_speed | time | end_time | travel_time | day_of_week |
|---|---|---|---|---|---|---|
16852 | 74029 | 11.550527 | 13:50:26 | 14:05:04 | 0 days 00:14:38 | 3 |
19383 | 68000 | 10.501318 | 08:43:27 | 08:58:18 | 0 days 00:14:51 | 2 |
3516 | 77235 | 11.940429 | 08:15:42 | 08:30:44 | 0 days 00:15:02 | 2 |
20456 | 62931 | 12.372028 | 15:32:37 | 15:47:41 | 0 days 00:15:04 | 3 |
23780 | 64768 | 12.349755 | 06:58:40 | 07:14:02 | 0 days 00:15:22 | 4 |
21134 | 27202 | 12.431247 | 12:00:50 | 12:16:13 | 0 days 00:15:23 | 0 |
6612 | 59302 | 12.982696 | 08:40:29 | 08:55:53 | 0 days 00:15:24 | 0 |
25398 | 73155 | 13.562712 | 17:15:13 | 17:30:44 | 0 days 00:15:31 | 3 |
15483 | 4642 | 10.400150 | 12:15:39 | 12:31:38 | 0 days 00:15:59 | 5 |
26925 | 29092 | 11.403490 | 13:41:04 | 13:57:35 | 0 days 00:16:31 | 2 |
5918 | 32409 | 10.423652 | 10:04:11 | 10:21:08 | 0 days 00:16:57 | 2 |
14313 | 32717 | 10.348051 | 08:36:01 | 08:53:17 | 0 days 00:17:16 | 2 |
14582 | 812 | 8.980774 | 09:46:24 | 10:03:50 | 0 days 00:17:26 | 1 |
4932 | 64543 | 12.019767 | 17:20:13 | 17:37:42 | 0 days 00:17:29 | 0 |
12230 | 63638 | 12.105873 | 12:59:07 | 13:16:44 | 0 days 00:17:37 | 3 |
1795 | 77812 | 11.585155 | 07:27:36 | 07:45:28 | 0 days 00:17:52 | 3 |
18841 | 71391 | 9.970573 | 08:15:01 | 08:32:55 | 0 days 00:17:54 | 2 |
19642 | 79581 | 12.105595 | 09:04:33 | 09:22:35 | 0 days 00:18:02 | 3 |
6674 | 69970 | 7.627218 | 09:37:58 | 09:56:09 | 0 days 00:18:11 | 0 |
27531 | 67353 | 7.658016 | 11:52:45 | 12:10:59 | 0 days 00:18:14 | 3 |
11049 | 76827 | 11.922940 | 09:00:50 | 09:19:05 | 0 days 00:18:15 | 4 |
118 | 79954 | 10.439263 | 09:40:20 | 09:58:37 | 0 days 00:18:17 | 2 |
27681 | 752 | 12.002246 | 08:59:33 | 09:17:56 | 0 days 00:18:23 | 5 |
27378 | 16453 | 13.500848 | 07:33:57 | 07:52:30 | 0 days 00:18:33 | 0 |
2041 | 59247 | 10.215965 | 08:09:10 | 08:27:58 | 0 days 00:18:48 | 1 |
27140 | 38586 | 10.713324 | 13:38:17 | 13:57:16 | 0 days 00:18:59 | 5 |
21913 | 913 | 9.870799 | 08:13:56 | 08:32:58 | 0 days 00:19:02 | 0 |
13245 | 32556 | 9.780340 | 09:53:19 | 10:12:49 | 0 days 00:19:30 | 3 |
25389 | 48588 | 8.956023 | 10:01:17 | 10:21:11 | 0 days 00:19:54 | 0 |
19130 | 74556 | 9.195931 | 09:24:43 | 09:44:39 | 0 days 00:19:56 | 2 |
2534 | 46166 | 10.304956 | 09:31:58 | 09:52:05 | 0 days 00:20:07 | 0 |
8498 | 51213 | 10.550349 | 17:40:35 | 18:00:46 | 0 days 00:20:11 | 6 |
11717 | 77564 | 11.638251 | 18:29:32 | 18:49:59 | 0 days 00:20:27 | 5 |
24124 | 59063 | 9.101774 | 09:34:58 | 09:55:30 | 0 days 00:20:32 | 1 |
13975 | 75945 | 9.207201 | 14:09:38 | 14:30:11 | 0 days 00:20:33 | 4 |
8550 | 35649 | 10.248823 | 18:10:11 | 18:31:03 | 0 days 00:20:52 | 2 |
22731 | 77712 | 9.506851 | 08:29:28 | 08:50:34 | 0 days 00:21:06 | 0 |
8690 | 63876 | 9.275410 | 08:42:44 | 09:04:00 | 0 days 00:21:16 | 3 |
25321 | 71405 | 12.988158 | 17:28:11 | 17:49:28 | 0 days 00:21:17 | 3 |
16673 | 78915 | 11.425962 | 10:05:30 | 10:26:59 | 0 days 00:21:29 | 2 |
14274 | 14443 | 12.939217 | 08:47:40 | 09:09:18 | 0 days 00:21:38 | 3 |
15276 | 5652 | 9.737339 | 07:31:30 | 07:53:44 | 0 days 00:22:14 | 0 |
18272 | 51004 | 8.951242 | 09:26:35 | 09:49:10 | 0 days 00:22:35 | 2 |
5817 | 60673 | 7.488158 | 09:43:07 | 10:05:52 | 0 days 00:22:45 | 1 |
5412 | 76277 | 12.521320 | 17:02:20 | 17:25:20 | 0 days 00:23:00 | 1 |
3352 | 69998 | 10.273710 | 10:05:03 | 10:28:17 | 0 days 00:23:14 | 2 |
17245 | 62254 | 10.444783 | 10:01:13 | 10:24:29 | 0 days 00:23:16 | 5 |
25906 | 68624 | 9.434911 | 09:31:03 | 09:55:00 | 0 days 00:23:57 | 4 |
27807 | 69106 | 7.437001 | 09:42:45 | 10:08:44 | 0 days 00:25:59 | 4 |
2191 | 76917 | 5.865573 | 17:54:05 | 18:23:14 | 0 days 00:29:09 | 3 |
12032 | 59076 | 7.405844 | 07:09:01 | 07:49:30 | 0 days 00:40:29 | 0 |
To visualise the travel times between the selected start and end points, we plotted a histogram of their frequency distribution (in minutes).
(results_df['travel_time'].dt.total_seconds() / 60).plot.hist()
<AxesSubplot:ylabel='Frequency'>
We did the same with average speed.
results_df['mean_speed'].plot.hist()
<AxesSubplot:ylabel='Frequency'>
We repeated this method of analysis with a variety of start and end locations, on both the Singapore and Jakarta dataset. We also tried to conduct separate analysis on the Jakarta dataset based on whether the trip was by car or motorcycle.
We have collated some of the outliers which took a longer time or took an odd route into our presentation.
Here are our findings:
- In both Jakarta and Singapore, our analysis of selected start and end points shows that drivers can take widely varying routes, which may lead to inconsistent trip times and, in turn, a perception of delays or late driver arrival
- Odd or longer routes are often taken during peak hours
- It is possible that Grab's route recommendation system recommends suboptimal routes during peak hours
- Alternative routes might have been suggested with the intention of avoiding congested roads, but ultimately resulted in a longer overall travel time
However, we acknowledge the following limitations:
- We cannot tell whether these drivers selected their routes based on information from the Grab app or deviated and chose different routes themselves
- It is possible that these longer trips/odd routes were due to multiple drop-off points or Grab Hitch rides
Nevertheless, here are some potential recommendations that could help alleviate the perception of long travel times:
- During peak hours, detouring around congested roads may not save time; Grab could recommend that drivers continue using the routes suggested during non-peak hours
- In Jakarta, travelling by car in the city area is slower relative to travelling in residential areas, whereas motorcycle average speeds are consistent across areas; Grab could recommend that customers travel by motorcycle in the city area for faster transport