Projection-based Person Identification

All the required files are available here: http://biointelligence.hu/typing-challenge/task2/index.php

import numpy as np
import pyhubs
import random

Preprocessing the ids and labels for training and test data

# Training set: every non-blank line is "<instance id>,<user label>".
train_data = []  # ids of train instances
train_labels = []  # labels of the train instances
with open('task2-keystrokes-12users-train-labels.txt') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. an empty line at the end of the file
        fields = line.split(",")
        train_data.append(int(fields[0]))
        train_labels.append(int(fields[1]))
# No explicit close is needed: leaving the with-block already closed the file.

train_labels = np.array(train_labels)
train_data = np.array(train_data)

# Test set: only the first column (the instance id) of the sample submission
# file is read.
test_data = []  # ids of test instances
with open('task2-random-submission.txt') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines
        test_data.append(int(line.split(",")[0]))
test_data = np.array(test_data)

The file containing the DTW distances was computed as follows. From the raw data, task2-keystrokes-12users-raw-data.txt, we build keystroke-duration time series: each element of a time series is the duration of one keystroke, i.e., the difference between the times of its keyup and keydown events. Each typing session is thus converted into one time series. We then compute the DTW distance between every pair of time series by calling the DTW function of the PyHubs library. The resulting matrix of pairwise DTW distances is called dtw_dist_dur.

# DTW distance matrix for the keystroke-duration time series, read from a
# space-delimited text file. get_dist_dur indexes it as dtw_dist_dur[i, j].

dtw_dist_dur = np.genfromtxt("dtw_distances_duration.txt",delimiter=" ")

# Lookup helper for the precomputed DTW distance matrix
def get_dist_dur(i,j):
    """Return the entry of ``dtw_dist_dur`` at row ``i``, column ``j``.

    ``project`` calls this with instance ids taken from ``train_data`` and
    ``test_data``, so those ids are used directly as matrix indices.
    """
    return dtw_dist_dur[i,j]

Projection of the data for n selected instances

def project(n,seed):
    """Project train and test instances onto ``n`` random training instances.

    ``n`` distinct positions of ``train_data`` are drawn at random; every
    train and test instance is then described by its distances (as returned
    by ``get_dist_dur``) to the instances at those positions.

    Args:
        n: number of reference instances, i.e. columns of the projection.
        seed: seed passed to ``random.seed`` so the selection is reproducible.
            Note that this reseeds the module-level ``random`` generator.

    Returns:
        A tuple ``(train_projected, test_projected)`` of arrays with shapes
        ``(len(train_data), n)`` and ``(len(test_data), n)``.

    Raises:
        ValueError: if ``n`` is larger than the number of training instances.
    """
    n_train = len(train_data)
    if n > n_train:
        # The references must be distinct, so the rejection sampling below
        # could never collect n of them and would loop forever.
        raise ValueError(
            "cannot select %d distinct reference instances from %d training instances"
            % (n, n_train)
        )

    random.seed(seed)
    selected = []   # chosen positions, in draw order (fixes the column order)
    already_drawn = set()  # same positions, for O(1) duplicate checks
    # Rejection sampling is kept (rather than random.sample) so that a given
    # seed keeps selecting the same reference instances as before.
    while len(selected) < n:
        r = random.randint(0, n_train - 1)
        if r not in already_drawn:
            already_drawn.add(r)
            selected.append(r)

    ids_of_selected = [train_data[pos] for pos in selected]

    def _distances_to_references(instance_ids):
        # One row per instance, one column per selected reference instance.
        projected = np.zeros((len(instance_ids), n))
        for row, instance_id in enumerate(instance_ids):
            for col, reference_id in enumerate(ids_of_selected):
                projected[row, col] = get_dist_dur(instance_id, reference_id)
        return projected

    train_projected = _distances_to_references(train_data)
    test_projected = _distances_to_references(test_data)
    return (train_projected,test_projected)

# Example usage of project: 3 reference instances, random seed 2
train_projected,test_projected = project(3,2)

# Bare expression: shows the projected training matrix when run interactively
train_projected

array([[  2992.,   4152.,   5776.],
       [  6419.,   8143.,   8139.],
       [  3892.,   4847.,   5945.],
       [  5750.,   6408.,      0.],
       [  4292.,   5441.,   5665.],
       [  3714.,   3745.,   5945.],
       [  3052.,   4784.,   5549.],
       [  5015.,   6053.,   5431.],
       [  3911.,   3768.,   6385.],
       [  5573.,   6595.,   7600.],
       [  4332.,   4269.,   6790.],
       [  3531.,   3712.,   5678.],
       [  3050.,   4493.,   5576.],
       [  5401.,   4603.,   7438.],
       [  4325.,   4673.,   6170.],
       [  2995.,   4512.,   5780.],
       [  4373.,   3021.,   6554.],
       [  4424.,   5148.,   5927.],
       [  4289.,   4291.,   6395.],
       [  4372.,   3919.,   6872.],
       [  3969.,   3614.,   5685.],
       [  4405.,   4197.,   6635.],
       [  3844.,   3438.,   6293.],
       [  3755.,   3176.,   6203.],
       [  3463.,   4424.,   5393.],
       [  4700.,   3854.,   7534.],
       [  5002.,   6246.,   7431.],
       [  5100.,   6122.,   7313.],
       [  2880.,   4274.,   5831.],
       [  4674.,   4129.,   6447.],
       [  5531.,   6489.,   7486.],
       [  3718.,   3896.,   5344.],
       [  3035.,   4672.,   5671.],
       [  3594.,   3462.,   5563.],
       [  4092.,   3326.,   6595.],
       [  3663.,   4416.,   5673.],
       [  3676.,   3915.,   5623.],
       [  4501.,   4257.,   6583.],
       [  4226.,   3773.,   6413.],
       [  3058.,   4399.,   5843.],
       [  3285.,   3926.,   5299.],
       [  4030.,   3316.,   6185.],
       [  4594.,   5682.,   4315.],
       [  5075.,   5782.,   7243.],
       [  3084.,   4429.,   5768.],
       [  4042.,   3312.,   6217.],
       [  4170.,   3616.,   6730.],
       [  7845.,  10088.,   9984.],
       [  3637.,   3468.,   6290.],
       [  3672.,   4494.,   5576.],
       [  4324.,   3199.,   6518.],
       [  4020.,   3837.,   6309.],
       [  3763.,   3790.,   6208.],
       [  4044.,   3358.,   6385.],
       [  4361.,      0.,   6408.],
       [     0.,   4361.,   5750.],
       [  3434.,   3911.,   5748.],
       [  4340.,   3242.,   6409.],
       [  2960.,   4403.,   5818.],
       [  4094.,   3354.,   6687.]])

# Fit an L2-penalised logistic regression on a 60-column projection (seed 1)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
train_projected, test_projected = project(60,1)
# fit() returns the estimator itself, so construction and training are chained
cls = LogisticRegression(penalty='l2').fit(train_projected,train_labels)
# Predict a label for every test instance from its projected features
predicted_labels = np.array(cls.predict(test_projected))

Writing the test instance ids and their predicted labels to a file

This file can be submitted to the 2nd Task of the Person Identification Challenge so that the predictions are evaluated.

# Write one "<test instance id>,<predicted label>" line per test instance.
# The with-statement closes the file even if a write fails part-way through.
with open('submit-process.txt','w') as f:
    for instance_id, label in zip(test_data, predicted_labels):
        f.write( str(instance_id)+","+str(label)+"\n" )