# Source code for imhr.Webgazer.raw

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
| @purpose: Automate importing data from the UTWeb server.   
| @date: Created on Sat May 1 15:12:38 2019   
| @author: Semeon Risom   
| @email: semeon.risom@gmail.com   
| @url: https://semeon.io/d/R33-analysis   
"""

from pdb import set_trace as breakpoint
import os, datetime
        
'''import packages'''
class raw():
    """Download raw participant data from the UTWeb server over SFTP and log what was fetched."""
    def __init__(self, is_library=False):
        """Download raw data from UTWeb server for use in analysis.
        
        Parameters
        ----------
        is_library : :obj:`bool`
            Check if required libraries are available.
        """
        #check libraries
        if is_library:
            self.library()

    def library(self):
        """Check if required libraries are available, installing any that are missing.

        Each required package is looked up with ``importlib.util.find_spec``. Packages
        that are found are imported; packages that are not found are installed by
        running ``pip`` as a subprocess of the current interpreter. A failed install
        does not stop the remaining packages from being processed.

        Returns
        -------
        :obj:`None` or :obj:`Exception`
            ``None`` when every step ran without raising, otherwise the exception
            that interrupted the check (it is returned, not raised).
        """
        # importlib.util must be imported explicitly; `import importlib` alone
        # does not guarantee the submodule is loaded.
        import importlib.util
        import platform
        import subprocess
        import sys

        #list of possibly missing packages to install
        required = ['pandas','openpyxl','pysftp','utils','cryptography','paramiko']

        #for getting os variables
        # NOTE(review): find_spec() checks *import* names, which may differ from
        # the distribution name passed to pip (e.g. 'pyobjc', 'win32api') - confirm.
        system = platform.system()
        if system == "Windows":
            required.append('win32api')
        elif system == 'Darwin':
            required.append('pyobjc')

        #try installing and/or importing packages
        try:
            for package in required:
                #if missing, install
                if importlib.util.find_spec(package) is None:
                    # pip is run as a subprocess rather than imported: it has no
                    # supported in-process API, and this avoids depending on
                    # distutils / pkg_resources to pick a pip entry point.
                    # call() (not check_call) keeps the install best-effort.
                    subprocess.call([sys.executable, '-m', 'pip', 'install', package])
                #else import
                else:
                    __import__(package)
        except Exception as e:
            return e
        return None

    @staticmethod
    def _count_csv(names):
        """Return how many entries of `names` contain ".csv"."""
        return len([x for x in names if ".csv" in x])

    @staticmethod
    def _latest_entry(ldate):
        """Return the ``[mtime, filename]`` entry with the highest mtime, or None if `ldate` is empty."""
        if not ldate:
            return None
        return max(ldate, key=lambda item: item[0])

    @staticmethod
    def _fetch_new_csvs(sftp, remote_path, names, local_dir):
        """Download every ``.csv`` in `names` that is not already present in `local_dir`.

        Returns a ``(count, last_name)`` tuple: the number of files downloaded and the
        name of the last one downloaded (``'nan'`` when nothing was downloaded).
        """
        count = 0
        last_name = 'nan'
        for filename in names:
            remote_fpath = remote_path +'/'+ filename
            local_fpath = local_dir +'/'+ filename
            #check if file already exists and is a csv
            if (filename.endswith('.csv')) and not (os.path.isfile(local_fpath)):
                sftp.get(remote_fpath, local_fpath)
                count = count + 1
                last_name = filename
        return count, last_name

    def download(self, l_exp, log_path, save_path, hostname, username, password):
        """Download raw data for use in analysis.

        For every experiment, new ``.csv`` files are copied from the server into
        ``save_path/<save>`` (and ``save_path/<save>/part`` for intermediate files).
        One row per experiment is then appended to ``log_path/participants.xlsx``,
        which is created if it does not exist.
        
        Parameters
        ----------
        l_exp : :obj:`list` of :obj:`dict`
            The experiments to pull data from. Each dict is read for the keys
            ``'path'`` (remote directory), ``'task'`` (task name) and ``'save'``
            (local sub-folder name).
        log_path : :obj:`str`
            The directory path to save the log of participant data downloaded.
        save_path : :obj:`str`
            The directory path to save participant data.
        hostname : :obj:`str`
            SSH hostname.
        username : :obj:`str`
            SSH username.
        password : :obj:`str`
            SSH password.

        Returns
        -------
        :obj:`list` of :obj:`dict`
            One ``{task: [st_mtime, filename]}`` dict per experiment describing the
            most recently modified ``.csv`` in the remote directory. The value is
            ``None`` when the remote directory holds no ``.csv`` files.
        """
        import paramiko, openpyxl
        import pandas as pd
        #current date
        now = datetime.datetime.now()

        #----create local folders for every experiment
        for exp in l_exp:
            save = exp['save']
            task = exp['task']
            #partial
            if (task != "wf_js") and (task != "gaze_js"):
                folder = save_path + '/' + save + '/part'
                if not os.path.exists(folder):
                    print('creating local folder: %s'%(folder))
                    os.makedirs(folder)
            #full
            folder = save_path + '/' + save
            if not os.path.exists(folder):
                print('creating local folder: %s'%(save))
                os.makedirs(folder)

        # connect to sftp and download files
        # def: intermediate - saved block of task (a[=first block] or ab[=first two blocks])
        # def: full - saved block of task (abc[=all blocks])
        print('connecting to sftp')
        client = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy accepts unknown host keys without
        # verification; left unchanged, but consider loading known hosts.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        l_row = [] #log of updated files
        highest_date = [] #most recent participants for each task
        # try/finally so the SSH connection and SFTP channel are released even
        # when a transfer fails part-way through.
        try:
            client.connect(hostname=hostname,username=username,password=password)
            sftp = client.open_sftp()
            try:
                #for each experiment
                for exp in l_exp:
                    path = exp['path']
                    task = exp['task']
                    save = exp['save']
                    newpartnum = 0 #intermediate task data counter
                    newpartname = 'nan' #most recent partial file names
                    # stays 0 when the task has no intermediate folder, instead of
                    # reusing a directory listing left over from a previous iteration
                    total_part = 0

                    #----intermediate files
                    # NOTE(review): the original comment said this is "only relevant
                    # for wf_js and gaze_js", but the condition selects every task
                    # EXCEPT those two - confirm which is intended.
                    if (task != "wf_js") and (task != "gaze_js"):
                        sftp.chdir(path+'/part')
                        part_remote = str(sftp.getcwd())
                        part_names = sftp.listdir()
                        newpartnum, newpartname = self._fetch_new_csvs(
                            sftp, part_remote, part_names,
                            save_path +'/'+ save +'/part')
                        #number of files in remote directory
                        total_part = self._count_csv(part_names)

                    #----full files
                    sftp.chdir(path)
                    remote_path = str(sftp.getcwd())
                    # a single listing provides both the names and the mtimes
                    attrs = sftp.listdir_attr()
                    full_names = [fileattr.filename for fileattr in attrs]
                    #get date of most recent file on server
                    ldate = [[fileattr.st_mtime, fileattr.filename]
                             for fileattr in attrs
                             if fileattr.filename.endswith('.csv')]
                    ##get highest date (None when there are no csv files)
                    highest_date.append({task: self._latest_entry(ldate)})

                    #download every csv that is not yet stored locally
                    newfullnum, newfullname = self._fetch_new_csvs(
                        sftp, remote_path, full_names, save_path +'/'+ save)

                    #number of files in remote directory
                    total_full = self._count_csv(full_names)
                    print('total files for %s: %s'%(task, total_full))

                    #append log list
                    row = [str(now.strftime('%Y-%m-%d %H:%M')),task,
                           newpartnum,newfullnum,
                           total_part,total_full,
                           newpartname,newfullname]
                    l_row.append(row)
            finally:
                #closing sftp
                sftp.close()
        finally:
            client.close()

        #--------------------------------------log
        log_path = log_path +"/participants.xlsx"
        #if file exists
        if os.path.exists(log_path):
            print("log updated: %s"%(log_path))
            #load file
            wb = openpyxl.load_workbook(log_path)
            # Select First Worksheet
            ws = wb.worksheets[0]
            #add data
            for fields in l_row:
                ws.append(fields)
            #update
            wb.save(log_path)

        #else create file
        else:
            print("raw data saved: %s"%(save_path))
            headers = ['date','task','uploaded intermediate files','uploaded full files','total intermediate files', 
                       'total full files','most recent intermediate file', 'most recent full file']
            #creating dataframe and save as xlsx
            df = pd.DataFrame(l_row, columns=headers)
            df.to_excel(log_path, index=False)

        return highest_date