# Source code for imhr.data.download

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
| `@purpose`: Download raw data from apache, Box, or REDCap servers.  
| `@date`: Created on Sat May 1 15:12:38 2019  
| `@author`: Semeon Risom  
| `@email`: semeon.risom@gmail.com  
| `@url`: https://semeon.io/d/imhr
"""

# available functions
__all__ = ['Download']

# required = 
__required__ = ['openpyxl','openpyxl']
# core
from pdb import set_trace as breakpoint

# local libraries
from .. import settings

# check that core (stdlib) dependencies are importable
try:
	# core
	import os
	from datetime import datetime
	from pathlib import Path
except ImportError as e:
	# map import names to their PyPI package names before reporting
	pkg = e.name
	x = {'psychopy':'psychopy','pandas':'pandas'}
	pkg = x[pkg] if pkg in x else pkg
	# NOTE(review): the trailing 'red' looks like a settings.console() color
	# argument accidentally passed to Exception — confirm intent.
	raise Exception("No module named '%s'. Please install from PyPI before continuing."%(pkg),'red')

class Download():
	"""Download raw data from apache, Box, or REDCap servers."""

	@classmethod
	def __init__(self, isLibrary=False):
		"""Download raw data from apache, Box, or REDCap servers.

		Parameters
		----------
		isLibrary : :obj:`bool`
			Check if required libraries are available.
		"""
		# optionally verify that required third-party packages are installed
		if isLibrary:
			settings.library(__required__)
[docs] @classmethod def REDCap(cls, path, token, url, content, payload=None, **kwargs): """Download data from an Research Electronic Data Capture (REDCap) server. Parameters ---------- path : :obj:`str` Path to save data. For example:: >>> path = '/Users/mdl-admin/Desktop/r33/redcap' token : :obj:`str` The API token specific to your REDCap project and username. This is usually found on the Applications > API page. For example:: >>> token = 'D19859823032SFDMR24395298' url : :obj:`str` The URL of the REDCap project. For example:: >>> url = 'https://redcap.prc.utexas.edu/redcap/api/'. content : :obj:`str` {report, file, raw, arm, instrument, returnFormat, metadata, project, surveyLink, user, participantList} Type of export. Examples include exporting a report (`report`), file (`file`), or project info (`project`). payload_ : :obj:`dict` or :obj:`None`, optional Manually submit parameters for exporting or importing data. Can be entered within the function for convenience. **kwargs : :obj:`str` or :obj:`None`, optional Additional properties, relevent for specific content types. Here's a list of available properties: .. list-table:: :class: kwargs :widths: 25 50 :header-rows: 1 * - Property - Description * - **report_id** : :obj:`str` - (report, record) The report ID number provided next to the report name on the report list page. * - **cformat** : :obj:`str` {csv, json, xml, odm} - Format to return data, either csv, json, xml, or odm. Default is json. * - **ctype** : :obj:`str` - Shape of data. Default is flat. * - **rawOrLabel**: :obj:`str` {raw, label} - (report, record) TExport the raw coded values or labels for the options of multiple choice fields. * - **rawOrLabelHeaders**: :obj:`str` - (report, record) TExport the variable/field names (raw) or the field labels (label). * - **exportCheckboxLabel**: :obj:`str` - Specifies the format of checkbox field values specifically when exporting the data as labels (i.e., when rawOrLabel=label). 
* - **returnFormat** : :obj:`str` - Format to return errors. Default is `json`. Returns ------- log : :obj:`pandas.DataFrame` or :obj:`None` Pandas dataframe of each download request. content : :obj:`pandas.DataFrame` or :obj:`None` Pandas dataframe of all data downloaded. start, end : :obj:`str` Timestamp (ISO format) and name of most recent (`end`) and first (`start`) file created in folder. now : :obj:`str` Current timestamp in ISO format. """ import openpyxl, requests import pandas as pd #----constants, lists to prepare ldate = [] #list of dates file_num = 0 #file counter # bool log = None #log of events start = None #most recent file end = None #most recent file #----path # log name = Path(path).name log_path = os.path.abspath(os.path.dirname(path + "/../")) # destination path = Path(path) #----kwargs # general cformat = kwargs['cformat'] if "cformat" in kwargs else 'json' ctype = kwargs['ctype'] if "ctype" in kwargs else 'flat' rawOrLabel = kwargs['rawOrLabel'] if "rawOrLabel" in kwargs else 'raw' rawOrLabelHeaders = kwargs['rawOrLabelHeaders'] if "rawOrLabelHeaders" in kwargs else 'raw' exportCheckboxLabel = kwargs['exportCheckboxLabel'] if "exportCheckboxLabel" in kwargs else 'false' returnFormat = kwargs['returnFormat'] if "returnFormat" in kwargs else 'json' # report report_id = kwargs['report_id'] if "report_id" in kwargs else 'None' # participantList instrument = kwargs['instrument'] if "instrument" in kwargs else 'cssrs' event = kwargs['event'] if "event" in kwargs else 'online_eligibility_arm_1' #----start settings.console('connecting to REDCap', 'blue') #----make sure local path exists settings.console('local folder: %s'%(Path(path)), 'blue') if not os.path.exists(Path(path)): settings.console('creating local folder: %s'%(Path(path)), 'blue') os.makedirs(Path(path)) #----start download # prepare if payload == None: # report if content == 'report': payload = { 'token': token, 'content': content, 'format': cformat, 'type': ctype, 'returnFormat': 
returnFormat, 'report_id': report_id, 'rawOrLabel': rawOrLabel, 'rawOrLabelHeaders': rawOrLabelHeaders, 'exportCheckboxLabel': exportCheckboxLabel } elif content == 'participantList': payload = { 'token': token, 'content': content, 'format': cformat, 'returnFormat': returnFormat, 'instrument': instrument, 'event': event, } elif content == 'metadata': payload = { 'token': token, 'content': content, 'format': cformat, 'returnFormat': returnFormat, } elif content == 'project': payload = { 'token': token, 'content': content, 'format': cformat, 'returnFormat': returnFormat, } else: raise Exception('Not finished. Content `%s` unavailable.'%(content)) # get data response = requests.post(url, data=payload) json = response.json() # if error if 'error' in json: raise Exception('Export failed. Reason: %s.'%(json['error'])) # convert to dataframe df = pd.DataFrame(json) # save data path_= os.path.dirname(path) + "/REDCap/%s.xlsx"%(content) df.to_excel(path_, index=False) # if list isnt empty get start and end dates if ldate: ## start start = min(map(lambda x: [x[0],x[1]], ldate)) settings.console('oldest file: %s'%(start[0]), 'blue') ## end end = max(map(lambda x: [x[0],x[1]], ldate)) settings.console('newest file: %s'%(end[0]), 'blue') #----log # if new files now = datetime.now().strftime('%Y-%m-%d %H:%M:%S') #!!! 
add values to go into dataframe---- row = [df.shape[0],token,content,now] # log file path fpath = "%s/%s.xlsx"%(log_path, name) # if log exists, update if os.path.exists(fpath): settings.console("Log updated @: %s"%(fpath), 'blue') #load file wb = openpyxl.load_workbook(fpath) # Select First Worksheet ws = wb.worksheets[0] # add data ws.append(row) # update wb.save(fpath) # import log log = pd.ExcelFile(fpath) # else create log else: settings.console('Log created @: %s'%(fpath), 'blue') headers = ['subjects', 'token', 'content', 'date'] # creating datafame and save as xlsx log = pd.DataFrame([row], columns=headers) log.to_excel(fpath, index=False) return log, start, end, now
[docs] @classmethod def SFTP(cls, source, destination, hostname, username, password, **kwargs): """Connect to a remote server using a Secure File Transfer Protocol (SFTP). Parameters ---------- source : :obj:`str` The directory path to retrieve paticipant data. destination : :obj:`str` The directory path to save paticipant data. hostname : :obj:`str` SFTP hostname. username : :obj:`str` SFTP username. password : :obj:`str` SFTP password. **kwargs : :obj:`str` or :obj:`None`, optional Additional properties, relevent for specific content types. Here's a list of available properties: .. list-table:: :class: kwargs :widths: 25 50 :header-rows: 1 * - Property - Description * - **filetype** : ::obj:`str` or :obj:`None` - Filetype to download. Default is csv. Returns ------- log : :obj:`pandas.DataFrame` or :obj:`None` Pandas dataframe of each download request. content : :obj:`pandas.DataFrame` or :obj:`None` Pandas dataframe of all files downloaded. start, end : :obj:`str` Timestamp (ISO format) and name of most recent (`end`) and first (`start`) file created in folder. now : :obj:`str` Current timestamp in ISO format. 
Examples -------- >>> name='r33'; source='/home/utweb/utw1211/public_html/r33'; d='/Users/mdl/Desktop/r33/'; un='utw1211'; pwd='43#!9amZ?K$' >>> log, start, end, now = download.SFTP(source=s, destination=d, hostname=hostname, username=un, password=pwd) """ import paramiko, openpyxl import pandas as pd #----kwargs lfiletype = kwargs["filetype"].replace('.','') if "filetype" in kwargs else ['csv'] #----constants, lists to prepare ldate = [] #list of dates file_num = 0 #file counter # bool log = None #log of events start = None #most recent file end = None #most recent file # path ## log name = Path(destination).name log_path = os.path.abspath(os.path.dirname(destination + "/../")) ## destination destination = Path(destination) settings.console('connecting to sftp', 'blue') #----make sure local path exists settings.console('local folder: %s'%(Path(destination)), 'blue') if not os.path.exists(Path(destination)): settings.console('creating local folder: %s'%(Path(destination)), 'blue') os.makedirs(Path(destination)) #----connect to SSH client client = paramiko.SSHClient() client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) client.connect(hostname=hostname, username=username, password=password) sftp = client.open_sftp() #----set path and get list of files sftp.chdir(source) remote_path = str(sftp.getcwd()) + '/' remote_directory = sftp.listdir() remote_attr = sftp.listdir_attr() #----check number of files in remote directory # list comprehension: <for each file in remote directory> <check if file extension matches> <and if file extension isn't empty string> remote_num = len([x for x in remote_directory if Path(x).suffix.replace('.','') in lfiletype and bool(Path(x).suffix.replace('.',''))]) try: if remote_num >= 1: settings.console('Total files in %s: %s'%(remote_path, remote_num), 'blue') else: raise Exception("Error: No %s found in folder: '%s'"%(lfiletype, remote_path)) except Exception as error: settings.console(str(error), 'red') raise #----export files 
settings.console("Starting download from %s"%(remote_path), 'blue') # for each file in remote directory for file in remote_attr: # file filename = file.filename filetype = Path(filename).suffix.replace('.','') # if file is correct filetype and not empty string if (filetype in lfiletype) and (bool(filetype)): # path remote_fpath = remote_path + filename local_fpath = '%s/%s'%(destination, filename) # date date = datetime.fromtimestamp(file.st_mtime).strftime('%Y-%m-%d %H:%M:%S') ldate.append([date, filename]) # if file not in local drive, download if not os.path.isfile(local_fpath): settings.console('filename: %s, type: %s, date: %s'%(filename, lfiletype, date), 'green') # counter file_num = file_num + 1 # download sftp.get(remote_fpath, local_fpath) # if list isnt empty get start and end dates if ldate: ## start start = min(map(lambda x: [x[0],x[1]], ldate)) settings.console('oldest file: %s'%(start[0]), 'blue') ## end end = max(map(lambda x: [x[0],x[1]], ldate)) settings.console('newest file: %s'%(end[0]), 'blue') #closing sftp sftp.close() #----log # if new files now = datetime.now().strftime('%Y-%m-%d %H:%M:%S') if file_num > 0: settings.console("Finished downloading files", 'blue') row = [now, file_num, remote_num, end[1]] fpath = "%s/%s.xlsx"%(log_path, name) # if log exists, update if os.path.exists(fpath): settings.console("Log updated @: %s"%(fpath), 'blue') #load file wb = openpyxl.load_workbook(fpath) # Select First Worksheet ws = wb.worksheets[0] # add data ws.append(row) # update wb.save(fpath) # import log log = pd.ExcelFile(fpath) # else create log else: settings.console('Log created @: %s'%(fpath), 'blue') headers = ['folder', 'downloaded', 'total', 'newest'] # creating datafame and save as xlsx log = pd.DataFrame([row], columns=headers) log.to_excel(fpath, index=False) else: settings.console("No files to download", 'blue') return log, start, end, now
[docs] @classmethod def SQL(cls, driver, destination, hostname, username, password, database, table, **kwargs): """ [summary] Parameters ---------- type : :obj:`str` {'MySQL','MSSQL'} The type of SQL server. Either **MySQL** or **MSSQL**. hostname : :obj:`str` The host name or IP address of the sql server. username : :obj:`str` The user name used to authenticate with the sql server. password : :obj:`str` The password to authenticate the user with the sql server. database : :obj:`str` The database name to use when connecting with the sql server. table : :obj:`str` The table name to use when connecting with the sql server. **kwargs : :obj:`str` or :obj:`None`, optional Additional properties, relevent for specific content types. Here's a list of available properties: .. list-table:: :class: kwargs :widths: 25 50 :header-rows: 1 * - Property - Description * - **port** : :obj:`int` - (report, record) The report ID number provided next to the report name on the report list page. Returns ------- connection : :obj:`mysql.connector.connect <https://dev.mysql.com/doc/connector-python/en/connector-python-api-mysql-connector-connect.html>`__ MySQL connector instance. """ # parameters config = kwargs["config"] if "config" in kwargs else False port = kwargs["port"] if "port" in kwargs else False cnx = '' # start config = None import mysql.connector from mysql.connector import errorcode from mysql.connector.constants import ClientFlag # if configuration if config: pass else: config = { 'user': username, 'password': password, 'host': host, 'database': database, 'get_warnings': True, 'client_flags': [ClientFlag.SSL], } # if port if port: config['port'] = port try: cnx = mysql.connector.connect(**config) cur = cnx.cursor(buffered=True) except mysql.connector.Error as err: if err.errno == errorcode.ER_ACCESS_DENIED_ERROR: print("Something is wrong with your user name or password") elif err.errno == errorcode.ER_BAD_DB_ERROR: print("Database does not exist") else: print(err) return cnx