In week 3, I will do visual data analysis and feature construction. First, I will build and analyze several features related to the time of visiting sites; then I will devise and describe additional features of my own.

Task plan:

  • Building features
  • Visual data analysis
  • Verification of constructed features
  • Further construction of features
import os
import time
import pickle
import re

import pandas as pd
import numpy as np
import scipy.sparse as sps
import seaborn as sns

from glob import glob
from scipy.sparse import csr_matrix
from datetime import timedelta
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")

Building features

PATH_TO_DATA = '/content/drive/MyDrive/DATA/Stepik/capstone_user_identification'

Based on the functions prepare_train_set and prepare_sparse_train_set_window, I will create a new one, prepare_train_set_with_fe ("fe" from "feature engineering"), which builds the following features:

  • session_timespan – session duration (the difference between the maximum and minimum site-visit times in the session, in seconds)
  • unique_sites – the number of unique sites in the session
  • start_hour – the hour the session started (i.e., the hour of the minimum of the ten timestamps)
  • day_of_week – the day of the week (i.e., the weekday of the minimum of the ten timestamps)

The function should return a new DataFrame (just as prepare_train_set did), only with 4 more features. The order in which the features are added: site1, ..., site10, session_timespan, unique_sites, start_hour, day_of_week and user_id (this can be seen below from how the function is called).

def prepare_train_set_with_fe(path_to_csv_files, site_freq_path, feature_names,
                                    session_length=10, window_size=10):
  stock_files = sorted(glob(path_to_csv_files))
  
  # load the pickled dictionary mapping each site to its (id, frequency) pair
  with open(site_freq_path, "rb") as fp:
    df_site_dict = pickle.load(fp)
  
  # convert each user's log into a list of site ids and per-session features
  data = []
  for filename in stock_files:
      user = int(filename[-8:-4])
      tmp_df = pd.read_csv(filename)
      tmp_df['timestamp'] = pd.to_datetime(tmp_df['timestamp'])
      start_hour_site = list(tmp_df['timestamp'].dt.hour)
      day_of_week_site = list(tmp_df['timestamp'].dt.weekday)
      list_site = []
      # gaps between consecutive site visits, in seconds (one fewer than the visits)
      list_time = list((tmp_df['timestamp'].shift(-1) - tmp_df['timestamp']) / np.timedelta64(1, 's'))[:-1]
      # round the gaps to whole seconds
      list_time = [round(x) for x in list_time]
      for site in tmp_df.site:
          list_site.append(df_site_dict.get(site)[0])
      count = 0
      # slide the window start across the list of sites, stepping by window_size
      for start in range(0, (len(list_site) + window_size), window_size):
          ind_1 = start
          ind_2 = start + session_length  # window end
          sess_uniq = []  # sites in the session, excluding the zero padding
          if ind_2 <= (len(list_site)-1):
              sess = list_site[ind_1 : ind_2]
              sess_time = list_time[ind_1 : ind_2-1]
              sess_uniq = list(filter(lambda num: num != 0, sess))
              data.append(sess + sess_time + [sum(sess_time)] + [len(set(sess_uniq))] + [start_hour_site[start]] + 
                          [day_of_week_site[start]] + [user])
          elif(len(list_site[ind_1:]) !=0):
              sess = list_site[ind_1:] + [0 for _ in range(session_length - len(list_site[ind_1:]))]
              sess_uniq = list(filter(lambda num: num != 0, sess))
              sess_time = list_time[ind_1:] + [0 for _ in range(session_length - len(list_time[ind_1:])-1)]
              data.append(sess + sess_time + [sum(sess_time)] + [len(set(sess_uniq))] + [start_hour_site[start]] + 
                          [day_of_week_site[start]] + [user])
  
  return pd.DataFrame(data, columns=feature_names)

Let's check the function on a toy example.

t_start = time.time()

feature_names = ['site' + str(i) for i in range(1,11)] + \
                ['time_diff' + str(j) for j in range(1,10)] + \
                ['session_timespan', '#unique_sites', 'start_hour', 
                 'day_of_week', 'target']
train_data_toy  = prepare_train_set_with_fe(os.path.join(PATH_TO_DATA, 
                                                         '3users/*.csv'), 
                  site_freq_path=os.path.join(PATH_TO_DATA, 
                                              'site_freq_3users.pkl'),
                  feature_names=feature_names, session_length=10)

print("Time elapsed", time.time() - t_start)
Time elapsed 2.094503402709961
train_data_toy
site1 site2 site3 site4 site5 site6 site7 site8 site9 site10 time_diff1 time_diff2 time_diff3 time_diff4 time_diff5 time_diff6 time_diff7 time_diff8 time_diff9 session_timespan #unique_sites start_hour day_of_week target
0 3 2 2 7 2 1 8 5 9 10 287 1184 6278 186 2 1 2 3 55 7998 8 9 4 1
1 3 1 1 1 0 0 0 0 0 0 2 3 55 0 0 0 0 0 0 60 2 12 4 1
2 3 2 6 6 2 0 0 0 0 0 287 1184 6278 186 0 0 0 0 0 7935 3 9 4 2
3 4 1 2 1 2 1 1 5 11 4 287 1184 6278 186 2 1 2 3 55 7998 5 9 4 3
4 4 1 2 0 0 0 0 0 0 0 287 1184 0 0 0 0 0 0 0 1471 3 12 4 3
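As a quick sanity check of the toy output (a small sketch over the DataFrame above): by construction, session_timespan must equal the row-wise sum of the nine time_diff columns.

# session_timespan was built as sum(sess_time), so this must hold by construction
time_diff_cols = ['time_diff' + str(j) for j in range(1, 10)]
assert (train_data_toy[time_diff_cols].sum(axis=1) == train_data_toy['session_timespan']).all()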

I will apply the function prepare_train_set_with_fe to the data for 10 users, specifying session_length=10.

start = time.time()

train_data_10users = prepare_train_set_with_fe(os.path.join(PATH_TO_DATA, 
                                                         '10users/*.csv'), 
                  site_freq_path=os.path.join(PATH_TO_DATA, 
                                              'site_freq_10users.pkl'),
                  feature_names=feature_names, session_length=10)

end = time.time()

print("Time elapsed", timedelta(seconds=end-start))
Time elapsed 0:00:10.294475
train_data_10users.head()
site1 site2 site3 site4 site5 site6 site7 site8 site9 site10 time_diff1 time_diff2 time_diff3 time_diff4 time_diff5 time_diff6 time_diff7 time_diff8 time_diff9 session_timespan #unique_sites start_hour day_of_week target
0 192 574 133 3 133 133 3 133 203 133 10 0 0 1 20 1 0 1 0 33 5 8 4 31
1 415 193 674 254 133 31 393 3305 217 55 1 0 163 105 0 1 3 3 8 284 10 8 4 31
2 55 3 55 55 5 293 415 333 897 55 0 14 1 242 0 0 1 0 0 258 7 8 4 31
3 473 3306 473 55 55 55 55 937 199 123 2 1 0 1 25 1 0 0 0 30 6 8 4 31
4 342 55 5 3307 258 211 3308 2086 675 2086 1 0 1 1 1 0 1 1 0 6 9 8 4 31

I will apply the function prepare_train_set_with_fe to the data for 150 users, specifying session_length=10.

start = time.time()

train_data_150users = prepare_train_set_with_fe(os.path.join(PATH_TO_DATA, 
                                                         '150users/*.csv'), 
                  site_freq_path=os.path.join(PATH_TO_DATA, 
                                              'site_freq_150users.pkl'),
                  feature_names=feature_names, session_length=10)

end = time.time()

print("Time elapsed", timedelta(seconds=end-start))
Time elapsed 0:01:42.615568

I will save the new features (time_diff1–time_diff9, session_timespan, #unique_sites, start_hour and day_of_week) to pickle files for 10 and 150 users.

new_feature_names = ['time_diff' + str(j) for j in range(1,10)] + \
                    ['session_timespan', '#unique_sites', 'start_hour','day_of_week']
new_features_10users = train_data_10users[new_feature_names]
new_features_150users = train_data_150users[new_feature_names]
with open(os.path.join(PATH_TO_DATA, 
                       'new_features_10users.pkl'), 'wb') as new_features_10users_pkl:
    pickle.dump(new_features_10users, new_features_10users_pkl)
with open(os.path.join(PATH_TO_DATA, 
                       'new_features_150users.pkl'), 'wb') as new_features_150users_pkl:
    pickle.dump(new_features_150users, new_features_150users_pkl)
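A minimal round-trip check (a sketch, assuming the same paths): reload one of the pickles and compare shapes.

# pickle.load restores the DataFrame exactly as saved
with open(os.path.join(PATH_TO_DATA, 'new_features_10users.pkl'), 'rb') as fp:
    restored = pickle.load(fp)
assert restored.shape == new_features_10users.shape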

Visual data analysis

Just for fun, we will give the users names and associate colors with them.

id_name_dict = {128: 'Mary-Kate', 39: 'Ashley', 207: 'Lindsey', 127: 'Naomi', 237: 'Avril',
               33: 'Bob', 50: 'Bill', 31: 'John', 100: 'Dick', 241: 'Ed'}
train_data_10users['target'] = train_data_10users['target'].map(id_name_dict)
color_dic = {'Mary-Kate': 'pink', 'Ashley': 'darkviolet', 'Lindsey':'blueviolet', 
             'Naomi': 'hotpink', 'Avril': 'orchid', 
             'Bob': 'firebrick', 'Bill': 'gold', 'John': 'forestgreen', 
             'Dick': 'slategrey', 'Ed':'brown'}
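One small guard worth adding (a sketch): Series.map returns NaN for any id missing from id_name_dict, so I confirm that every session received a name.

# map() yields NaN for unmapped user ids; make sure none slipped through
assert train_data_10users['target'].notna().all()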

1. I will build a histogram of the distribution of session length in seconds (session_timespan). I will limit x to 200 (otherwise the tail is too heavy) and color the histogram darkviolet.

x = train_data_10users[train_data_10users.session_timespan < 200].session_timespan
fig, ax = plt.subplots(figsize=(15, 5))
sns.distplot(x, bins=100, kde=False, color='darkviolet', hist_kws=dict(edgecolor="k", linewidth=2))
plt.title("Session length distribution")
plt.xlabel("Session length in seconds")
plt.ylabel("Frequency")
plt.show()

2. I will build a histogram of the distribution of the number of unique sites in a session (#unique_sites). I will color the histogram aqua.

ax =sns.catplot(x="#unique_sites", kind="count", color='aqua', data=train_data_10users, height=5, aspect=2.5)
plt.title("Distribution of the number of unique sites", fontsize=12)
plt.xlabel("Number", fontsize=10)
plt.ylabel("Frequency", fontsize=10)
plt.show()

3. I will build histograms of the distribution of the number of unique sites in a session (#unique_sites) for each of the 10 users individually, using subplots to place all 10 plots in one figure. I will give each plot a legend with the user's name and color each histogram with that user's color (color_dic).

list_user = list(color_dic.keys())

fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(20, 20))

# one histogram per user, colored with the user's color
for ax, name in zip(axes.flat, list_user):
    sns.countplot(train_data_10users[train_data_10users.target == name]["#unique_sites"],
                  color=color_dic[name], ax=ax)
    ax.set_xlabel('Number of unique sites')
    ax.set_ylabel('Frequency')
    ax.legend([name])

4. I will build a histogram of the distribution of the session start hour (start_hour). I will color the histogram darkgreen.

ax =sns.catplot(x="start_hour", kind="count", color='darkgreen', data=train_data_10users,
                order = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
                height=5, aspect=2.5)
plt.title("Allocation of the session start hour", fontsize=12)
plt.xlabel("Hour", fontsize=10)
plt.ylabel("Frequency", fontsize=10)
plt.show()

5. I will build histograms of the distribution of the session start hour (start_hour) for each of the 10 users individually, using subplots to place all 10 plots in one figure. I will give each plot a legend with the user's name, color each histogram with that user's color (color_dic), and label the axes of each of the 10 histograms.

fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(20, 20))

for ax, name in zip(axes.flat, list_user):
    sns.countplot(train_data_10users[train_data_10users.target == name]["start_hour"],
                  color=color_dic[name],
                  order=list(range(24)),
                  ax=ax)
    ax.set_xlabel('Session start hour')
    ax.set_ylabel('Frequency')
    ax.legend([name])

6. I will build a histogram of the distribution of the day of the week on which the session started (day_of_week). I will color the histogram sienna.

ax =sns.catplot(x="day_of_week", kind="count", color='sienna', data=train_data_10users, height=5, aspect=2.5)
plt.title("Distribution of the week in which the session started", fontsize=12)
plt.xlabel("Day of the week", fontsize=10)
plt.ylabel("Frequency", fontsize=10)
plt.show()

7. I will build histograms of the distribution of the day of the week on which the session started (day_of_week) for each of the 10 users individually, using subplots to place all 10 plots in one figure. I will change the X-axis labels to ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] with the set_xticklabels method, give each plot a legend with the user's name, and color each histogram with that user's color (color_dic).

fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(20, 20))

for ax, name in zip(axes.flat, list_user):
    sns.countplot(train_data_10users[train_data_10users.target == name]["day_of_week"],
                  color=color_dic[name], order=list(range(7)), ax=ax)
    ax.set_xlabel('Day of the week of the session start')
    ax.set_ylabel('Frequency')
    ax.legend([name])
    ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])

8. Conclusions about each user based on the plots constructed.

  1. 'Mary-Kate' – the unique-site counts match the overall trend; her sessions tend to start an hour later than the overall peak; sessions usually start at the end of the working week, with a maximum on Friday, whereas overall Wednesday starts are more common.
  2. 'Ashley' – the unique-site counts are slightly below the overall trend; sessions start in the morning with a maximum at 10 am; like the previous user, session starts cluster at the end of the working week with a maximum on Friday, whereas overall Wednesday is more common.
  3. 'Lindsey' – the unique-site counts match the overall trend, with one more unique site also being typical; sessions usually start between 8 and 10 o'clock; Wednesday stands out clearly as the start day compared to the other days.
  4. 'Naomi' – the unique-site counts are shifted slightly upward compared to the overall trend; session starts are typically an hour later than the overall peak; Monday stands out as the start day, tapering off toward Wednesday, with the remaining days at a minimum level.
  5. 'Avril' – two unique sites is most characteristic (this count also stands out slightly in the overall trend); the start-hour distribution matches the overall picture but has a more pronounced evening tail; sessions mostly start on Wednesday or Saturday.
  6. 'Bob' – slightly fewer unique sites than usual; his start-hour peak coincides with the overall maximum at 14:00; Thursday is the most typical start day, and the remaining days are roughly equal.
  7. 'Bill' – two unique sites is most characteristic, with counts falling off beyond that; start hours peak from 9 to 10 in the morning and from 19 to 21 in the evening; the most typical start days are weekends.
  8. 'John' – the unique-site counts match the overall trend with a maximum at 7; the most pronounced start hours are 12 and, as in the overall trend, 14; session starts concentrate on Tuesday and Wednesday with a maximum on Wednesday.
  9. 'Dick' – the typical number of unique sites is one fewer than in the overall trend; start hours peak at 13 and 14, with the maximum at 14 as in the overall distribution; the common start days are Tuesday, Wednesday, Thursday and Sunday, with the remaining days about half as frequent and roughly equal.
  10. 'Ed' – the unique-site histogram matches the overall one; start hours are shifted later, toward 16:00; the maximum is on Wednesday, as in the overall trend, and also on Saturday and Sunday, with the remaining days unremarkable.
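The conclusions above are read off the plots; as a rough numeric companion (a sketch over the same DataFrame), the per-user medians of the plotted features can be printed directly.

# per-user medians; target already holds the mapped user names
print(train_data_10users.groupby('target')[['#unique_sites', 'start_hour', 'day_of_week']].median())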

I will load the site frequency dictionary for 10 users saved earlier in a pickle file.

with open(os.path.join(PATH_TO_DATA, 'site_freq_10users.pkl'), 'rb') as fid:
    site_freq_10 = pickle.load(fid)

I will determine the top 10 most visited sites (top10_sites) and the corresponding numbers of visits (top10_freqs).

ten_best = list(site_freq_10.keys())[:10]
for site in ten_best:
    print(site)
s.youtube.com
www.google.fr
www.google.com
mail.google.com
www.facebook.com
apis.google.com
r3---sn-gxo5uxg-jqbe.googlevideo.com
r1---sn-gxo5uxg-jqbe.googlevideo.com
plus.google.com
accounts.google.com
feature_names_two = ['site' + str(i) for i in range(1,11)]
train_data_10users_array = train_data_10users[feature_names_two].values
# count how many times each site id occurs across all session columns
unique, counts = np.unique(train_data_10users_array, return_counts=True)
ten_max_site = {}
for u, c in dict(zip(unique, counts)).items():
    if u == 0:        # skip the zero padding
        continue
    elif u > 10:      # ids are sorted, so everything past the top 10 can be dropped
        break
    else:
        ten_max_site[ten_best[u-1]] = c   # ids 1..10 are the ten most frequent sites
top10_freqs = list(ten_max_site.values())
top10_sites = list(ten_max_site.keys())
df_top10sites = pd.DataFrame({'Top 10 sites' : top10_sites, 'Top freqs' : top10_freqs})
df_top10sites
Top 10 sites Top freqs
0 s.youtube.com 8300
1 www.google.fr 7813
2 www.google.com 5441
3 mail.google.com 4158
4 www.facebook.com 4141
5 apis.google.com 3758
6 r3---sn-gxo5uxg-jqbe.googlevideo.com 3244
7 r1---sn-gxo5uxg-jqbe.googlevideo.com 3094
8 plus.google.com 2630
9 accounts.google.com 2089
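As a cross-check (a sketch that assumes each value in site_freq_10 is an (id, frequency) pair, as the [0]-indexing inside prepare_train_set_with_fe suggests), the counts derived from the session matrix should match the stored frequencies, since window_size equals session_length and the zero padding adds no visits.

# compare session-matrix counts with the frequencies stored in the dictionary
for site, freq in zip(top10_sites, top10_freqs):
    print(site, freq, site_freq_10[site][1])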

9. I will draw a seaborn barplot showing the visit frequencies of the top 10 sites. I will rotate the site labels to vertical, otherwise they overlap (xticks).

fig, ax = plt.subplots(figsize=(15, 5))
ax = sns.barplot(x=top10_sites, y=top10_freqs)
plt.xticks(rotation = 90)
plt.show()

Checking the constructed features

This part is rather technical; its purpose is to make sure that we have all constructed the session_timespan, #unique_sites, start_hour and day_of_week features in the same way.

10. I will output the median session duration (session_timespan) for sessions of 10 users.

np.median(train_data_10users['session_timespan'])
37.0

11. I will output the median day of the week on which the session started for sessions of 10 users.

np.median(train_data_10users['day_of_week'])
2.0

12. I will output the median session start hour for sessions of 150 users.

np.median(train_data_150users['start_hour'])
13.0

13. I will output the median value of the number of unique sites in the sessions of 150 users.

np.median(train_data_150users['#unique_sites'])
7.0
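The same four medians can also be pulled in two calls (a quick sketch over the same DataFrames).

# compact re-check of the medians above
print(train_data_10users[['session_timespan', 'day_of_week']].median())
print(train_data_150users[['start_hour', '#unique_sites']].median())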

Further construction of features

This is a creative task: here you need to figure out how else to take the time of visiting web pages into account, along with other features.

Next week we will use a "bag of sites" to classify sessions by which user they belong to, and we will add the new features created now to see whether the model improves. Therefore, they can be created as separate matrices and saved separately as well.

In this part of the task, you can build and visually explore a wide variety of features (nothing limits your imagination):

  • year, month and day of the session start
  • the hour of the session start (taking into account the year, month and day)
  • time of day (see the sketch after this list)
  • average time spent on a site (can be computed, for example, over the top 30 popular sites)
  • indicators of visits to popular sites (say, for the same top 30)
  • frequency of visits to Facebook
  • ...
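As an illustration of the time-of-day idea from the list above, here is a sketch; the bucket boundaries are my own assumption, not fixed by the task.

def time_of_day(hour):
    # rough buckets: morning 6-11, day 12-17, evening 18-22, night otherwise
    if 6 <= hour < 12:
        return 'morning'
    if 12 <= hour < 18:
        return 'day'
    if 18 <= hour < 23:
        return 'evening'
    return 'night'

# e.g.: train_data_10users['time_of_day'] = train_data_10users['start_hour'].map(time_of_day)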

I will write a function to create new features and apply it to the source data, the directories with 10 and 150 files. I will do this only for the dataset obtained with session_length=10 and window_size=10, and I will serialize the resulting matrices with pickle. The function may return either only the new features or the old ones together with the new ones; its signature is up to me.

def feature_engineering(train_data):
    df_new = train_data.copy()
    # binary indicator: 0 for weekdays, 1 for weekends
    df_new['weekday'] = df_new['day_of_week'].apply(lambda x: 1 if x > 4 else 0)
    # was at least one top-10 site visited in this session
    site_col_list = ['site' + str(i) for i in range(1,11)]
    df_new['popular'] = df_new[df_new[site_col_list]<11].sum(axis=1).apply(lambda x: 1 if x > 0 else 0)
    # how many of the session's ten slots hold top-10 sites
    df_new['count_popular'] = 10 - df_new[site_col_list][df_new[site_col_list]<11].isna().sum(axis=1)
    # total time spent on top-10 sites within the session (computed below)
    
    # collect (row, column) positions of the top-10 sites, sparse-matrix style;
    # non-popular slots become NaN under the mask and then 0 after nan_to_num
    row = 0
    rows = []
    cols = []
    data = []

    tmp_arr = np.nan_to_num(np.array(df_new[site_col_list][df_new[site_col_list] < 11]))
    for arr in tmp_arr:
        for key, value in enumerate(arr):
            if value != 0:
                rows.append(row)
                cols.append(key)
                data.append(value)
        row = row + 1
    
    
    # per session, sum the time gap that follows each popular site;
    # the time_diff for site column c sits 10 positions to the right
    row_count = -1
    sum_time_sess = [0] * df_new.shape[0]
    for r, c in zip(rows, cols):
        if c == 9:  # site10 has no following time_diff
            continue
        elif r==row_count:
            sum_time_sess[row_count] = sum_time_sess[row_count] + df_new.loc[r][c+10]
        else:
            sum_time_sess[r] = df_new.loc[r][c+10]
            row_count = r
    df_new['time_popular'] = sum_time_sess
    return df_new

new_futures = feature_engineering(train_data_10users)
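The time_popular loop above walks the collected (row, column) pairs one by one; a vectorized sketch of the same quantity (total time on top-10 sites per session, counting the time_diff that follows each popular site) should agree with the loop's result.

# site10 is excluded because no time_diff follows it
site_cols = ['site' + str(i) for i in range(1, 10)]
diff_cols = ['time_diff' + str(i) for i in range(1, 10)]
mask = (new_futures[site_cols].values >= 1) & (new_futures[site_cols].values <= 10)
time_popular_vec = (new_futures[diff_cols].values * mask).sum(axis=1)
assert (time_popular_vec == new_futures['time_popular'].values).all()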

14. I will plot the new features, examine them, and comment on the results.

ax =sns.catplot(x="start_hour", kind="count", hue="weekday", data=new_futures,
                order = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
                height=5, aspect=2.5)

# title
new_title = 'Day of the week'
ax._legend.set_title(new_title)
# replace labels
new_labels = ['Weekdays','Weekends']
for t, l in zip(ax._legend.texts, new_labels):
    t.set_text(l)

fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(20, 20))

# per-user start-hour histograms split by the weekday/weekend indicator;
# filter the data itself so x and hue come from the same (subset) frame
for ax, name in zip(axes.flat, list_user):
    sns.countplot(x="start_hour", hue="weekday",
                  data=new_futures[new_futures.target == name],
                  order=list(range(24)), ax=ax)
    ax.set_xlabel('Session start hour')
    ax.set_ylabel('Frequency')
    ax.legend(['Weekdays', 'Weekends'], title=name)

Splitting the start hours by weekdays versus weekends reveals additional user traits: three users hardly go online on weekends at all, four show a noticeable drop in weekend surfing, and the remaining three barely change, with two of them only shifting their hours of activity.

fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(20, 20))

for ax, name in zip(axes.flat, list_user):
    sns.countplot(x="start_hour", hue="popular",
                  data=new_futures[new_futures.target == name],
                  order=list(range(24)), ax=ax)
    ax.set_xlabel('Session start hour')
    ax.set_ylabel('Frequency')
    # popular == 0 means no top-10 site in the session, 1 means at least one
    ax.legend(['Without top-10 sites', 'With top-10 sites'], title=name)

For four users, the start-hour profile of sessions containing popular sites matches that of the rest, although some show a shift toward the morning hours; three users show an increase around lunchtime, from 13:00 to 15:00.

new_futures.head()
site1 site2 site3 site4 site5 site6 site7 site8 site9 site10 time_diff1 time_diff2 time_diff3 time_diff4 time_diff5 time_diff6 time_diff7 time_diff8 time_diff9 session_timespan #unique_sites start_hour day_of_week target weekday popular count_popular time_popular
0 192 574 133 3 133 133 3 133 203 133 10 0 0 1 20 1 0 1 0 33 5 8 4 John 0 1 2 1
1 415 193 674 254 133 31 393 3305 217 55 1 0 163 105 0 1 3 3 8 284 10 8 4 John 0 0 0 0
2 55 3 55 55 5 293 415 333 897 55 0 14 1 242 0 0 1 0 0 258 7 8 4 John 0 1 2 14
3 473 3306 473 55 55 55 55 937 199 123 2 1 0 1 25 1 0 0 0 30 6 8 4 John 0 0 0 0
4 342 55 5 3307 258 211 3308 2086 675 2086 1 0 1 1 1 0 1 1 0 6 9 8 4 John 0 1 1 1
x = new_futures[new_futures.time_popular < 200].time_popular
fig, ax = plt.subplots(figsize=(15, 5))
sns.distplot(x, bins=200, kde=False, color='darkviolet', hist_kws=dict(edgecolor="k", linewidth=2))
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(20, 20))

for ax, name in zip(axes.flat, list_user):
    x = new_futures[(new_futures.time_popular < 200) & (new_futures.target == name)].time_popular
    sns.distplot(x, bins=20, kde=False, color='darkviolet',
                 hist_kws=dict(edgecolor="k", linewidth=2), ax=ax)
    ax.set_xlabel('Time on popular sites in seconds')
    ax.set_ylabel('Frequency')
    ax.set_title(name)
plt.tight_layout()

The analysis shows that visits to the top-10 sites are usually short. A similar count could be done separately for a popular social network; even now, though, three devoted fans of the popular sites stand out among the users.

In the end, I will save to pickle files only those features that I expect will help identify the user more accurately. This applies both to the features created at the beginning (session_timespan, #unique_sites, start_hour, day_of_week) and to my own. All these features could be built not only for sessions of 10 sites but also for other combinations of the session_length and window_size parameters.

selected_features_10users = feature_engineering(train_data_10users)
selected_features_150users = feature_engineering(train_data_150users)
with open(os.path.join(PATH_TO_DATA, 
                       'selected_features_10users.pkl'), 'wb') as selected_features_10users_pkl:
    pickle.dump(selected_features_10users, selected_features_10users_pkl, 
                protocol=2)
with open(os.path.join(PATH_TO_DATA, 
                       'selected_features_150users.pkl'), 'wb') as selected_features_150users_pkl:
    pickle.dump(selected_features_150users, selected_features_150users_pkl, 
                protocol=2)