Source code for imhr.Webgazer.raw

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
| @purpose: Automate data importing from the UTWeb server.   
| @date: Created on Sat May 1 15:12:38 2019   
| @author: Semeon Risom   
| @email: semeon.risom@gmail.com   
| @url: https://semeon.io/d/R33-analysis   
"""

'''import packages'''
from pdb import set_trace as breakpoint
import os, datetime

class raw():
    """Processing summary data for output."""

    def __init__(self, is_library=False):
        """Download raw data from the UTWeb server for use in analysis.

        Parameters
        ----------
        is_library : :obj:`bool`
            Check if required libraries are available.
        """
        # check libraries
        if is_library:
            self.library()

    def library(self):
        """Check if required libraries are available."""
        # check for missing libraries
        from distutils.version import StrictVersion
        import importlib, pkg_resources, pip, platform

        # list of possibly missing packages to install
        required = ['pandas', 'openpyxl', 'pysftp', 'utils', 'cryptography', 'paramiko']
        # os-specific packages
        if platform.system() == "Windows":
            required.append('win32api')
        elif platform.system() == 'Darwin':
            required.append('pyobjc')

        # try installing and/or importing packages
        try:
            # if pip >= 10.0.0
            pip_ = pkg_resources.get_distribution("pip").version
            if StrictVersion(pip_) > StrictVersion('10.0.0'):
                from pip._internal import main as _main
                # for each required package, check if it exists on this device
                for package in required:
                    # if missing, install; else import
                    if importlib.util.find_spec(package) is None:
                        _main(['install', package])
                    else:
                        __import__(package)
            # else pip < 10.0.0
            else:
                # for each required package, check if it exists on this device
                for package in required:
                    # if missing, install; else import
                    if importlib.util.find_spec(package) is None:
                        pip.main(['install', package])
                    else:
                        __import__(package)
        except Exception as e:
            return e

    def download(self, l_exp, log_path, save_path, hostname, username, password):
        """Download raw data for use in analysis.

        Parameters
        ----------
        l_exp : :obj:`list`
            The list of experiments to pull data from.
        log_path : :obj:`str`
            The directory path to save the log of participant data downloaded.
        save_path : :obj:`str`
            The directory path to save participant data.
        hostname : :obj:`str`
            SSH hostname.
        username : :obj:`str`
            SSH username.
        password : :obj:`str`
            SSH password.
        """
        import paramiko, openpyxl
        import pandas as pd

        # current date
        now = datetime.datetime.now()

        # ----for every experiment, create local folders if needed
        for exp in l_exp:
            save = exp['save']
            task = exp['task']
            # partial (intermediate) data folder; skipped for wf_js and gaze_js
            if (task != "wf_js") and (task != "gaze_js"):
                folder = save_path + '/' + save + '/part'
                if not os.path.exists(folder):
                    print('creating local folder: %s' % (folder))
                    os.makedirs(folder)
            # full data folder
            folder = save_path + '/' + save
            if not os.path.exists(folder):
                print('creating local folder: %s' % (folder))
                os.makedirs(folder)

        '''
        connect to sftp and download files
        def: intermediate - saved block of task (a[=first block] or ab[=first two blocks])
        def: full - saved block of task (abc[=all blocks])
        '''
        print('connecting to sftp')
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(hostname=hostname, username=username, password=password)
        sftp = client.open_sftp()

        # for each experiment
        l_row = []  # log of updated files
        highest_date = []  # most recent participants for each task
        for exp in l_exp:
            path = exp['path']
            task = exp['task']
            save = exp['save']
            newpartnum = 0  # intermediate task data counter
            newfullnum = 0  # full task data file counter
            newpartname = 'nan'  # most recent partial file name
            newfullname = 'nan'  # most recent full file name
            total_part = 0  # number of intermediate csv files on the server

            # ----intermediate files; skipped for wf_js and gaze_js
            if (task != "wf_js") and (task != "gaze_js"):
                sftp.chdir(path + '/part')
                remote_path = str(sftp.getcwd())
                remote_directory = sftp.listdir()
                # for each file in remote directory
                for filename in remote_directory:
                    remote_fpath = remote_path + '/' + filename
                    local_fpath = save_path + '/' + save + '/part/' + filename
                    # download if the file is a csv and does not already exist locally
                    if (filename.endswith('.csv')) and not (os.path.isfile(local_fpath)):
                        sftp.get(remote_fpath, local_fpath)
                        newpartnum = newpartnum + 1
                        newpartname = filename
                # number of csv files in remote directory
                total_part = len([x for x in remote_directory if ".csv" in x])

            # ----full files
            sftp.chdir(path)
            remote_path = str(sftp.getcwd())
            remote_directory = sftp.listdir()
            # get date of most recent csv file on server
            ldate = []
            for fileattr in sftp.listdir_attr():
                if fileattr.filename.endswith('.csv'):
                    ldate.append([fileattr.st_mtime, fileattr.filename])
            # keep the most recent (highest mtime) for this task
            highest_date.append({task: max(ldate, key=lambda item: item[0])})
            # for each file in remote directory
            for filename in remote_directory:
                remote_fpath = remote_path + '/' + filename
                local_fpath = save_path + '/' + save + '/' + filename
                # download if the file is a csv and does not already exist locally
                if (filename.endswith('.csv')) and not (os.path.isfile(local_fpath)):
                    sftp.get(remote_fpath, local_fpath)
                    newfullnum = newfullnum + 1
                    newfullname = filename
            # number of csv files in remote directory
            total_full = len([x for x in remote_directory if ".csv" in x])
            print('total files for %s: %s' % (task, total_full))

            # append log list
            row = [str(now.strftime('%Y-%m-%d %H:%M')), task,
                   newpartnum, newfullnum,
                   total_part, total_full,
                   newpartname, newfullname]
            l_row.append(row)

        # closing sftp
        sftp.close()

        # --------------------------------------log
        log_path = log_path + "/participants.xlsx"
        # if the log file exists, append to it
        if os.path.exists(log_path):
            print("log updated: %s" % (log_path))
            # load file
            wb = openpyxl.load_workbook(log_path)
            # select first worksheet
            ws = wb.worksheets[0]
            # add data
            for fields in l_row:
                ws.append(fields)
            # update
            wb.save(log_path)
        # else create file
        else:
            print("raw data saved: %s" % (save_path))
            headers = ['date', 'task', 'uploaded intermediate files', 'uploaded full files',
                       'total intermediate files', 'total full files',
                       'most recent intermediate file', 'most recent full file']
            # create dataframe and save as xlsx
            df = pd.DataFrame(l_row, columns=headers)
            df.to_excel(log_path, index=False)

        return highest_date
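
A minimal usage sketch (not part of the module source): the hostname, credentials, local paths, and experiment entries below are placeholders, and the ``path``/``save``/``task`` keys follow the structure that ``download()`` reads from each experiment dict.

# minimal usage sketch; all hostnames, credentials, and paths are placeholders
from imhr.Webgazer.raw import raw

# experiments to pull: 'path' is the remote sftp directory, 'save' the local subfolder, 'task' the task name
l_exp = [
    {'path': '/remote/data/gaze', 'save': 'gaze', 'task': 'gaze_js'},
    {'path': '/remote/data/stroop', 'save': 'stroop', 'task': 'stroop_js'},
]

# check (and, if missing, install) required packages, then download new csv files
data = raw(is_library=True)
latest = data.download(
    l_exp=l_exp,
    log_path='/local/logs',        # participants.xlsx is created or appended here
    save_path='/local/raw',        # csv files are saved to <save_path>/<save>[/part]
    hostname='sftp.example.com',   # placeholder ssh host
    username='user',               # placeholder credentials
    password='password'
)
print(latest)  # most recent csv [mtime, filename] per task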