#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
| @purpose: Process participants metadata for analysis and export.
| @date: Created on Sat May 1 15:12:38 2019
| @author: Semeon Risom
| @email: semeon.risom@gmail.com
| @url: https://semeon.io/d/R33-analysis
"""
# available classes and functions
__all__ = ['Metadata']
# required external libraries
__required__ = ['pandas','numpy','json']
# core
from pdb import set_trace as breakpoint
import os
import pandas as pd
import numpy as np
import json
# local libraries
from . import settings
class Metadata():
    """Process participants metadata for analysis and export."""

    # Participant ids that are excluded from every export.
    _TEST_PARTICIPANTS = (111111, 999999)
    # Placeholder stored for an unavailable webcam resolution (-1 or "0x0"); splitting it on 'x'
    # yields '.' for both the width and the height column.
    _MISSING_SIZE = '.x.'
    # Browser names, in the order they are tested; a later match overrides an earlier one.
    _BROWSERS = ('Chrome', 'Safari', 'Edge', 'Firefox', 'IE')
    # (substring searched in the raw os string, label written to `os`); later matches win.
    _OS_LABELS = (('Windows', 'Microsoft Windows'), ('Mac', 'macOS'), ('Chrome', 'Chrome OS'))
    # Name prefixes removed from the raw os string to obtain `os_version`.
    _OS_PREFIXES = ('Windows', 'Mac OS X', 'Chrome OS')
    # Ordered (old, new) substitutions applied to the raw gpu string.
    _GPU_REPLACEMENTS = (
        ('ANGLE ', ''), ('(A', 'A'), ('(I', 'I'), ('(N', 'N'),
        ('vs_5_', ''), ('ps_5_', ''), ('vs_3_', ''), ('ps_3_', ''), (' 0 0)', ''),
        ('(R)', ''), ('(TM)', ''), ('OpenGL', ''), ('Engine', ''),
        ('Direct3D11', ''), ('Direct3D9Ex', ''), ('Family', ''), ('Mesa DRI ', ''),
        (' (Skylake GT2)', ''),
    )
    # Trial-level columns removed from the metadata export.
    _DROP_COLUMNS = [
        'sampleNum', 'x', 'y', 'duration.t', 'Stim_onset.t', 'DotLoc_onset.t', 'blockNum',
        'trialNum', 'TrialNum', 'trialID', 'Key_Resp.rt', 'Key_Resp.cresp', 'Key_Resp.acc',
        'DotLoc', 'LEmotion', 'LStim', 'LDescription', 'REmotion', 'RStim', 'RDescription',
        'trialType', 'isCongruent', 'event', 'trial_type', 'internal_node_id', 'RT',
    ]
    # Keys identifying one recording.
    _KEYS = ['participant', 'session', 'subsession']
    # Columns written to the per-group Excel summaries in `predict`.
    _SUMMARY_COLUMNS = ['participant', 'session', 'subsession', 'os', 'os_version', 'gpu',
                        'diagonalSize.cm', 'screen size (cm)', 'gpu list', 'exact match']
    # (column written by `_match_devices`, reference-table column it is read from).
    _MATCH_FIELDS = (
        ('gpu list', 'gpu list'),
        ('screen size (cm)', 'screen size (cm)'),
        ('pixel density (ppi)', 'pixel density (ppi)'),
        ('devices', 'device'),
        ('model id', 'model id'),
        ('resolution (px)', 'monitorSize'),
    )
    # Order in which `_match_devices` appends its columns.
    _MATCH_COLUMNS = ('gpu list', 'screen size (cm)', 'pixel density (ppi)', 'exact match',
                      'devices', 'model id', 'resolution (px)')

    def __init__(self, isLibrary=False):
        """Process participants metadata for analysis and export.

        Parameters
        ----------
        isLibrary : :obj:`bool`
            Check if required libraries are available. Default `False`.
        """
        #check libraries
        if isLibrary:
            settings.library(__required__)

    @staticmethod
    def _split_size(sizes):
        """Split "<width>x<height>" strings into a two-column frame ('width', 'height').

        Non-string cells, and a missing height, become NaN. Values stay strings.
        """
        def _pair(value):
            if not isinstance(value, str):
                return (np.nan, np.nan)
            width, separator, height = value.partition('x')
            return (width, height if separator else np.nan)

        return pd.DataFrame([_pair(value) for value in sizes],
                            index=sizes.index, columns=['width', 'height'])

    @staticmethod
    def _strip_name(value, names):
        """Remove the first of `names` that prefixes `value`, then trim the remainder.

        Surrounding whitespace and trailing a/b/c letters (either case) are removed, as the
        original clean-up did. Non-string values are returned unchanged.
        """
        if not isinstance(value, str):
            return value
        for name in names:
            # prefix test rather than str.lstrip(name): lstrip strips a *character set*
            if value.startswith(name):
                value = value[len(name):]
                break
        return value.strip().rstrip('aAbBcC')

    @classmethod
    def _clean_gpu(cls, value):
        """Return the gpu description as a string with renderer boilerplate removed."""
        # missing values become the string 'nan', matching the previous str(NaN) behaviour
        text = 'nan' if pd.isna(value) else str(value)
        for old, new in cls._GPU_REPLACEMENTS:
            text = text.replace(old, new)
        return text.strip()

    @classmethod
    def summary(cls, df, path):
        """
        Prepare participant metadata for analysis and save it as csv.

        Parameters
        ----------
        df : :class:`pandas.DataFrame`
            Raw data. The columns read here are `participant`, `browser`, `os`, `date`, `gpu`,
            `WebcamDevice`, `webcamSize.px`, `windowSize.px`, `monitorSize.px`,
            `devicePixelRatio` and `diagonalSize.in`. The input frame is not modified.
        path : :obj:`str`
            File path the processed metadata is written to (csv).

        Returns
        -------
        df : :class:`pandas.DataFrame`
            Processed metadata, identical to what is written to `path`.

        Notes
        -----
        Rows of participants 111111 and 999999 are removed. The trial-level columns listed in
        `_DROP_COLUMNS` are removed when present.
        """
        #convert ids before filtering so ids stored as strings are excluded as well
        df = df.assign(participant=pd.to_numeric(df['participant']))
        df = df.loc[~df['participant'].isin(cls._TEST_PARTICIPANTS)].copy()

        #keep the raw browser, os and date strings under new names
        df = df.rename(columns={'browser': 'browser_old', 'os': 'os_old', 'date': 'date_old'})

        #gpu type: AMD and Nvidia adapters count as dedicated, everything else as integrated
        is_dedicated = df['gpu'].str.contains('AMD|Nvidia|NVIDIA', na=False)
        df['gpu_type'] = np.where(is_dedicated, 'dedicated', 'integrated')

        #webcam brand: first word of the raw device name
        #NOTE(review): taken before '(Built-in) ' is removed from WebcamDevice below - confirm
        df['webcam_brand'] = df['WebcamDevice'].str.split().str.get(0)

        #webcam size: -1 and "0x0" both mean that no size is available
        df['webcamSize.px'] = df['webcamSize.px'].map(
            lambda x: cls._MISSING_SIZE if (x == -1 or x == '0x0') else x)
        webcam = cls._split_size(df['webcamSize.px'])
        df['webcamWidth'] = webcam['width']
        df['webcamHeight'] = webcam['height']

        #window size
        window = cls._split_size(df['windowSize.px'])
        df['windowWidth'] = window['width']
        df['windowHeight'] = window['height']

        #monitor size: undo the earlier multiplication by devicePixelRatio
        df['monitorSize old'] = df['monitorSize.px']
        #the ratio has to be numeric before it can be used as a divisor
        df['devicePixelRatio'] = pd.to_numeric(df['devicePixelRatio'])
        monitor = cls._split_size(df['monitorSize.px'])
        df['monitorWidth'] = pd.to_numeric(monitor['width']) / df['devicePixelRatio']
        df['monitorHeight'] = pd.to_numeric(monitor['height']) / df['devicePixelRatio']

        #browser name and version
        df['browser'] = 'None'
        for name in cls._BROWSERS:
            df.loc[df['browser_old'].str.contains(name, na=False, regex=False), 'browser'] = name
        df['browser_version'] = df['browser_old'].map(lambda x: cls._strip_name(x, cls._BROWSERS))

        #os name and version
        df['os'] = 'None'
        for token, label in cls._OS_LABELS:
            df.loc[df['os_old'].str.contains(token, na=False, regex=False), 'os'] = label
        df['os_version'] = df['os_old'].map(lambda x: cls._strip_name(x, cls._OS_PREFIXES))

        #date: underscores to dashes, then keep the calendar date only, as text
        date = df['date_old'].str.strip().str.replace('_', '-', regex=False)
        df['date'] = pd.to_datetime(date).dt.date.astype(str)

        #rename
        df = df.rename(columns={'windowSize.px': 'windowSize', 'monitorSize.px': 'monitorSize',
                                'webcamSize.px': 'webcamSize', 'lum': 'luminance'})

        #convert to cm
        df['diagonalSize.cm'] = df['diagonalSize.in'].map(lambda x: round(x * 2.54, 3))

        #rebuild monitorSize from the integer part of the corrected width and height
        df['monitorSize'] = df['monitorWidth'].map(str).str.split('.').str[0] + 'x' + \
            df['monitorHeight'].map(str).str.split('.').str[0]

        #gpu: remove renderer boilerplate and surrounding whitespace
        df['gpu'] = df['gpu'].map(cls._clean_gpu)

        #unavailable webcam size is exported as NaN (width and height keep the '.' placeholder)
        df['webcamSize'] = df['webcamSize'].map(lambda x: np.nan if x == cls._MISSING_SIZE else x)

        #webcam device: -1 means no device; the product id is the first "(...)" group
        df['WebcamDevice'] = df['WebcamDevice'].map(
            lambda x: np.nan if x == -1 else str(x).replace('(Built-in) ', ''))
        df['WebcamDeviceProductID'] = df['WebcamDevice'].astype(object).str.extract(
            r'\(([^()]*)\)', expand=False)

        #rename variables
        df = df.rename(columns={'trialNumTask': 'TrialNum', 'Key_Resp.resp': 'RT',
                                'isWebcamUsed': 'is_eyetracking'})

        #drop trial-level columns; not every input carries all of them
        df = df.drop(columns=cls._DROP_COLUMNS, errors='ignore')

        #------------------------------------------------save
        df.to_csv(path, index=False)
        print("demographics saved: %s"%(path))
        return df

    @classmethod
    def _match_devices(cls, df_os, df_devices):
        """Match every row of `df_os` against the reference devices by gpu name.

        A device matches when the row's gpu (case-insensitive) is a substring of one of the
        names in the device's `gpu list`. The columns in `_MATCH_COLUMNS` are appended:
        no match stores the string 'nan', a single match stores that device's values (and
        `exact match` True), several matches store lists of the candidates' values.
        Returns a new frame sorted by participant, session and subsession.
        """
        matched = df_os.sort_values(by=cls._KEYS).reset_index(drop=True)

        #lower-case every reference gpu name once instead of once per participant row
        devices = []
        for _, device in df_devices.iterrows():
            record = device.to_dict()
            record['_names'] = [str(name).lower() for name in device['gpu list']]
            devices.append(record)

        cells = {column: [] for column in cls._MATCH_COLUMNS}
        for gpu in matched['gpu']:
            needle = str(gpu).lower()
            hits = [record for record in devices
                    if any(needle in name for name in record['_names'])]
            for target, source in cls._MATCH_FIELDS:
                values = [str(hit[source]) for hit in hits]
                if not values:
                    cells[target].append('nan')
                elif len(values) == 1:
                    cells[target].append(values[0])
                else:
                    cells[target].append(values)
            cells['exact match'].append(len(hits) == 1)

        #dtype=object keeps list cells intact; assigning whole columns avoids chained indexing
        for column in cls._MATCH_COLUMNS:
            matched[column] = pd.Series(cells[column], index=matched.index, dtype=object)
        return matched

    @classmethod
    def _save_summary(cls, frame, path):
        """Write the `_SUMMARY_COLUMNS` of `frame` to the Excel file `path`."""
        frame[cls._SUMMARY_COLUMNS].to_excel(path, index=False)

    @classmethod
    def predict(cls, df, df_summary=None, cwd_save=None):
        """
        Predicting screen size (cm), device (i.e. macbook 2018).

        The reference table `info/screensize.xlsx` (located two directories above this file)
        is matched against `df`: lab machines by os, resolution and location; other machines
        by gpu name. Per-group summaries (`lab_summary.xlsx`, `osx_summary.xlsx`,
        `chrome_summary.xlsx`) and the combined `summary.json` are written to `cwd_save`.

        Parameters
        ----------
        df : :class:`pandas.DataFrame`
            Processed metadata (see `summary`). Its `os_version` column is updated in place.
        df_summary : :class:`pandas.DataFrame`
            Frame with `participant`, `session`, `subsession` and `location` columns.
            Required; it was an undefined name before.
        cwd_save : :obj:`str`
            Directory the summaries are written to. Required; it was an undefined name before.

        Returns
        -------
        df : :class:`pandas.DataFrame`
            The input frame.

        Raises
        ------
        ValueError
            If `df_summary` or `cwd_save` is not provided.
        """
        if df_summary is None or cwd_save is None:
            raise ValueError("predict() requires both `df_summary` (participant locations) "
                             "and `cwd_save` (output directory)")

        #clean up sub-version of data: drop the last two characters for OSX rows
        #NOTE(review): `summary` labels this os 'macOS' / 'Chrome OS', while this method filters
        #on 'OSX' / 'Chrome' - confirm which vocabulary `df` and screensize.xlsx use.
        #NOTE(review): [:-2] only removes a single-digit patch number - confirm the format.
        df['os_version'] = np.where(
            df['os'] == 'OSX',
            df['os_version'].map(lambda x: x[:-2] if isinstance(x, str) else x),
            df['os_version'])

        #import reference screen size
        info_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'info')
        df_screensize = pd.read_excel(os.path.join(info_dir, 'screensize.xlsx'))
        #rename for merge
        df_screensize = df_screensize.rename(
            columns={'resolution (px)': 'monitorSize', 'gpu': 'gpu list'})
        #remove excel non-breaking space \xa0
        for column in ('gpu list', 'device', 'model id'):
            df_screensize[column] = df_screensize[column].replace({'\\xa0': ' '}, regex=True)
        #convert each comma-separated gpu cell into a list of trimmed names
        df_screensize['gpu list'] = df_screensize['gpu list'].astype(str).map(
            lambda x: [name.strip() for name in x.split(',')])
        #convert inches to cm
        df_screensize['screen size (cm)'] = df_screensize['screen size (in)'].map(
            lambda x: round(x * 2.54, 3))

        #merge location (lab or home); one row per session so that a summary with one row per
        #subsession does not multiply the rows of df
        location = df_summary[['participant', 'session', 'location']].drop_duplicates()
        df_all = pd.merge(df, location, on=['participant', 'session'], how='left')
        df_all = df_all.sort_values(by=cls._KEYS).reset_index(drop=True)

        #lab computer (if: subject is in lab and using one of the lab machines)
        lab_devices = df_screensize.loc[
            df_screensize['is lab computer'] == True].reset_index(drop=True)
        lab_devices['location'] = 'lab'
        lab_devices['exact match'] = True
        lab_devices['devices'] = lab_devices['device']
        lab_devices['resolution (px)'] = lab_devices['monitorSize']
        df_lab = pd.merge(
            df_all,
            lab_devices[['os', 'gpu list', 'monitorSize', 'location', 'screen size (cm)',
                         'pixel density (ppi)', 'exact match', 'devices', 'model id',
                         'resolution (px)']],
            on=['os', 'monitorSize', 'location'], how='left')
        #rows without a lab machine have NaN here and are handled by the gpu matching below
        df_lab = df_lab.loc[df_lab['exact match'] == True]
        df_lab = df_lab.sort_values(by=cls._KEYS).reset_index(drop=True)
        cls._save_summary(df_lab, os.path.join(cwd_save, 'lab_summary.xlsx'))

        #osx devices
        df_osx = cls._match_devices(df_all.loc[df_all['os'] == 'OSX'],
                                    df_screensize.loc[df_screensize['os'] == 'OSX'])
        cls._save_summary(df_osx, os.path.join(cwd_save, 'osx_summary.xlsx'))

        #chromebook devices
        df_chrome = cls._match_devices(df_all.loc[df_all['os'] == 'Chrome'],
                                       df_screensize.loc[df_screensize['os'] == 'Chrome'])
        cls._save_summary(df_chrome, os.path.join(cwd_save, 'chrome_summary.xlsx'))

        #combine data---------osx, chrome, and lab
        df_merge = pd.concat([df_lab, df_osx, df_chrome], ignore_index=True)
        df_merge = df_merge.sort_values(by=cls._KEYS).reset_index(drop=True)
        #arrange columns
        df_merge = df_merge.reindex(list(df_osx.columns), axis=1)

        #combine data---------all
        df_merge_small = df_merge[
            ['participant', 'session', 'subsession', 'devices', 'exact match', 'resolution (px)',
             'screen size (cm)', 'pixel density (ppi)']]
        df_f = pd.merge(df, df_merge_small, on=cls._KEYS, how='outer')
        #cells may hold lists of candidates; export them as text
        for column in ('devices', 'resolution (px)', 'screen size (cm)', 'pixel density (ppi)'):
            df_f[column] = df_f[column].astype(str)

        #merge location (lab or home) df and df_summary data
        ##prepare (copy so the caller's summary frame is not modified)
        df_sum = df_summary[['participant', 'session', 'subsession', 'location']].copy()
        ##convert types before filtering so both frames compare on the same dtype
        for frame in (df_f, df_sum):
            frame['participant'] = frame['participant'].astype(float)
            frame['session'] = frame['session'].astype(float)
        ##drop unusual participants
        df_f = df_f.loc[~df_f['participant'].isin(cls._TEST_PARTICIPANTS)]
        df_sum = df_sum.loc[~df_sum['participant'].isin(cls._TEST_PARTICIPANTS)]
        df_f = df_f.sort_values(by=cls._KEYS).reset_index(drop=True)
        df_sum = df_sum.sort_values(by=cls._KEYS).reset_index(drop=True)
        ##merge
        df_f = pd.merge(df_f, df_sum, on=cls._KEYS, how='left')

        #export as a javascript assignment; the records are written exactly as pandas
        #serialises them so escaped quotes and unicode stay valid
        with open(os.path.join(cwd_save, 'summary.json'), 'w') as f:
            f.write('json_data =' + df_f.to_json(orient='records'))

        #NOTE(review): the merged frame is only exported; the input frame is what is returned,
        #as documented - confirm whether callers expect the merged frame instead.
        return df