#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
| `@purpose`: Download raw data from apache, Box, or REDCap servers.
| `@date`: Created on Sat May 1 15:12:38 2019
| `@author`: Semeon Risom
| `@email`: semeon.risom@gmail.com
| `@url`: https://semeon.io/d/imhr
"""
# available functions
__all__ = ['Download']
# required external packages
__required__ = ['openpyxl','requests','pandas','paramiko']
# core
from pdb import set_trace as breakpoint
# local libraries
from .. import settings
# check if core libraries are available
try:
# core
import os
from datetime import datetime
from pathlib import Path
except ImportError as e:
pkg = e.name
# map import name to PyPI package name
x = {'psychopy':'psychopy','pandas':'pandas'}
pkg = x[pkg] if pkg in x else pkg
raise Exception("No module named '%s'. Please install from PyPI before continuing."%(pkg),'red')
class Download():
"""Download raw data from apache, Box, or REDCap servers."""
@classmethod
def __init__(self, isLibrary=False):
"""Download raw data from apache, Box, or REDCap servers.
Parameters
----------
isLibrary : :obj:`bool`
Check if required libraries are available.
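Examples
--------
A minimal sketch; the import path below assumes this module is imported from the top-level `imhr` package.
>>> from imhr.r33 import Download
>>> dl = Download(isLibrary=True)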
"""
#check libraries
if isLibrary:
settings.library(__required__)
@classmethod
def REDCap(cls, path, token, url, content, payload=None, **kwargs):
"""Download data from an Research Electronic Data Capture (REDCap) server.
Parameters
----------
path : :obj:`str`
Path to save data.
For example::
>>> path = '/Users/mdl-admin/Desktop/r33/redcap'
token : :obj:`str`
The API token specific to your REDCap project and username. This is usually found on the Applications > API page.
For example::
>>> token = 'D19859823032SFDMR24395298'
url : :obj:`str`
The URL of the REDCap project.
For example::
>>> url = 'https://redcap.prc.utexas.edu/redcap/api/'
content : :obj:`str` {report, file, raw, arm, instrument, returnFormat, metadata, project, surveyLink, user, participantList}
Type of export. Examples include exporting a report (`report`), file (`file`), or project info (`project`).
payload : :obj:`dict` or :obj:`None`, optional
Manually submit parameters for exporting or importing data. Can be entered within the function for convenience.
**kwargs : :obj:`str` or :obj:`None`, optional
Additional properties, relevant for specific content types. Here's a list of available properties:
.. list-table::
:class: kwargs
:widths: 25 50
:header-rows: 1
* - Property
- Description
* - **report_id** : :obj:`str`
- (report, record) The report ID number provided next to the report name on the report list page.
* - **cformat** : :obj:`str` {csv, json, xml, odm}
- Format to return data, either csv, json, xml, or odm. Default is json.
* - **ctype** : :obj:`str`
- Shape of data. Default is flat.
* - **rawOrLabel**: :obj:`str` {raw, label}
- (report, record) Export the raw coded values or the labels for the options of multiple choice fields.
* - **rawOrLabelHeaders**: :obj:`str`
- (report, record) Export the variable/field names (raw) or the field labels (label).
* - **exportCheckboxLabel**: :obj:`str`
- Specifies the format of checkbox field values specifically when exporting the data as labels (i.e., when rawOrLabel=label).
* - **returnFormat** : :obj:`str`
- Format to return errors. Default is `json`.
Returns
-------
log : :obj:`pandas.DataFrame` or :obj:`None`
Pandas dataframe of each download request.
content : :obj:`pandas.DataFrame` or :obj:`None`
Pandas dataframe of all data downloaded.
start, end : :obj:`str`
Timestamp (ISO format) and name of most recent (`end`) and first (`start`) file created in folder.
now : :obj:`str`
Current timestamp in ISO format.
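Examples
--------
A minimal sketch of a report export; the `path`, `token`, and `url` values are taken from the parameter examples above, and `report_id` is a placeholder.
>>> path = '/Users/mdl-admin/Desktop/r33/redcap'; token = 'D19859823032SFDMR24395298'; url = 'https://redcap.prc.utexas.edu/redcap/api/'
>>> log, start, end, now = Download.REDCap(path=path, token=token, url=url, content='report', report_id='1')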
"""
import openpyxl, requests
import pandas as pd
#----constants, lists to prepare
ldate = [] #list of dates
file_num = 0 #file counter
# defaults
log = None #log of events
start = None #oldest file
end = None #newest file
#----path
# log
name = Path(path).name
log_path = os.path.abspath(os.path.dirname(path + "/../"))
# destination
path = Path(path)
#----kwargs
# general
cformat = kwargs.get('cformat', 'json')
ctype = kwargs.get('ctype', 'flat')
rawOrLabel = kwargs.get('rawOrLabel', 'raw')
rawOrLabelHeaders = kwargs.get('rawOrLabelHeaders', 'raw')
exportCheckboxLabel = kwargs.get('exportCheckboxLabel', 'false')
returnFormat = kwargs.get('returnFormat', 'json')
# report
report_id = kwargs.get('report_id', 'None')
# participantList
instrument = kwargs.get('instrument', 'cssrs')
event = kwargs.get('event', 'online_eligibility_arm_1')
#----start
settings.console('connecting to REDCap', 'blue')
#----make sure local path exists
settings.console('local folder: %s'%(Path(path)), 'blue')
if not os.path.exists(Path(path)):
settings.console('creating local folder: %s'%(Path(path)), 'blue')
os.makedirs(Path(path))
#----start download
# prepare
if payload is None:
# report
if content == 'report':
payload = {
'token': token,
'content': content,
'format': cformat,
'type': ctype,
'returnFormat': returnFormat,
'report_id': report_id,
'rawOrLabel': rawOrLabel,
'rawOrLabelHeaders': rawOrLabelHeaders,
'exportCheckboxLabel': exportCheckboxLabel
}
elif content == 'participantList':
payload = {
'token': token,
'content': content,
'format': cformat,
'returnFormat': returnFormat,
'instrument': instrument,
'event': event,
}
elif content == 'metadata':
payload = {
'token': token,
'content': content,
'format': cformat,
'returnFormat': returnFormat,
}
elif content == 'project':
payload = {
'token': token,
'content': content,
'format': cformat,
'returnFormat': returnFormat,
}
else:
raise Exception("Content type '%s' is not yet supported."%(content))
# get data
response = requests.post(url, data=payload)
json = response.json()
# if error
if 'error' in json:
raise Exception('Export failed. Reason: %s.'%(json['error']))
# convert to dataframe
df = pd.DataFrame(json)
# save data to the local folder created above
path_ = "%s/%s.xlsx"%(path, content)
df.to_excel(path_, index=False)
# if list isnt empty get start and end dates
if ldate:
## start
start = min(map(lambda x: [x[0],x[1]], ldate))
settings.console('oldest file: %s'%(start[0]), 'blue')
## end
end = max(map(lambda x: [x[0],x[1]], ldate))
settings.console('newest file: %s'%(end[0]), 'blue')
#----log
# if new files
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# values to append to the log
row = [df.shape[0], token, content, now]
# log file path
fpath = "%s/%s.xlsx"%(log_path, name)
# if log exists, update
if os.path.exists(fpath):
settings.console("Log updated @: %s"%(fpath), 'blue')
#load file
wb = openpyxl.load_workbook(fpath)
# Select First Worksheet
ws = wb.worksheets[0]
# add data
ws.append(row)
# update
wb.save(fpath)
# import log
log = pd.ExcelFile(fpath)
# else create log
else:
settings.console('Log created @: %s'%(fpath), 'blue')
headers = ['subjects', 'token', 'content', 'date']
# create dataframe and save as xlsx
log = pd.DataFrame([row], columns=headers)
log.to_excel(fpath, index=False)
return log, start, end, now
@classmethod
def SFTP(cls, source, destination, hostname, username, password, **kwargs):
"""Connect to a remote server using a Secure File Transfer Protocol (SFTP).
Parameters
----------
source : :obj:`str`
The remote directory path to retrieve participant data from.
destination : :obj:`str`
The local directory path to save participant data to.
hostname : :obj:`str`
SFTP hostname.
username : :obj:`str`
SFTP username.
password : :obj:`str`
SFTP password.
**kwargs : :obj:`str` or :obj:`None`, optional
Additional properties, relevant for specific content types. Here's a list of available properties:
.. list-table::
:class: kwargs
:widths: 25 50
:header-rows: 1
* - Property
- Description
* - **filetype** : :obj:`str` or :obj:`None`
- Filetype to download. Default is csv.
Returns
-------
log : :obj:`pandas.DataFrame` or :obj:`None`
Pandas dataframe of each download request.
content : :obj:`pandas.DataFrame` or :obj:`None`
Pandas dataframe of all files downloaded.
start, end : :obj:`str`
Timestamp (ISO format) and name of most recent (`end`) and first (`start`) file created in folder.
now : :obj:`str`
Current timestamp in ISO format.
Examples
--------
>>> s='/home/utweb/utw1211/public_html/r33'; d='/Users/mdl/Desktop/r33/'; un='utw1211'; pwd='43#!9amZ?K$'
>>> log, start, end, now = Download.SFTP(source=s, destination=d, hostname=hostname, username=un, password=pwd)
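Download only a single filetype by passing the optional `filetype` keyword described above, reusing the variables from the previous example::
>>> log, start, end, now = Download.SFTP(source=s, destination=d, hostname=hostname, username=un, password=pwd, filetype='csv')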
"""
import paramiko, openpyxl
import pandas as pd
#----kwargs
lfiletype = [kwargs.get('filetype', 'csv').replace('.','')] #list of filetypes to download
#----constants, lists to prepare
ldate = [] #list of dates
file_num = 0 #file counter
# defaults
log = None #log of events
start = None #oldest file
end = None #newest file
# path
## log
name = Path(destination).name
log_path = os.path.abspath(os.path.dirname(destination + "/../"))
## destination
destination = Path(destination)
settings.console('connecting to sftp', 'blue')
#----make sure local path exists
settings.console('local folder: %s'%(Path(destination)), 'blue')
if not os.path.exists(Path(destination)):
settings.console('creating local folder: %s'%(Path(destination)), 'blue')
os.makedirs(Path(destination))
#----connect to SSH client
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(hostname=hostname, username=username, password=password)
sftp = client.open_sftp()
#----set path and get list of files
sftp.chdir(source)
remote_path = str(sftp.getcwd()) + '/'
remote_directory = sftp.listdir()
remote_attr = sftp.listdir_attr()
#----check number of files in remote directory
# list comprehension: <for each file in remote directory> <check if file extension matches> <and if file extension isn't empty string>
remote_num = len([x for x in remote_directory if Path(x).suffix.replace('.','') in lfiletype and bool(Path(x).suffix.replace('.',''))])
try:
if remote_num >= 1:
settings.console('Total files in %s: %s'%(remote_path, remote_num), 'blue')
else:
raise Exception("Error: No %s found in folder: '%s'"%(lfiletype, remote_path))
except Exception as error:
settings.console(str(error), 'red')
raise
#----export files
settings.console("Starting download from %s"%(remote_path), 'blue')
# for each file in remote directory
for file in remote_attr:
# file
filename = file.filename
filetype = Path(filename).suffix.replace('.','')
# if file is correct filetype and not empty string
if (filetype in lfiletype) and (bool(filetype)):
# path
remote_fpath = remote_path + filename
local_fpath = '%s/%s'%(destination, filename)
# date
date = datetime.fromtimestamp(file.st_mtime).strftime('%Y-%m-%d %H:%M:%S')
ldate.append([date, filename])
# if file not in local drive, download
if not os.path.isfile(local_fpath):
settings.console('filename: %s, type: %s, date: %s'%(filename, filetype, date), 'green')
# counter
file_num = file_num + 1
# download
sftp.get(remote_fpath, local_fpath)
# if list isnt empty get start and end dates
if ldate:
## start
start = min(map(lambda x: [x[0],x[1]], ldate))
settings.console('oldest file: %s'%(start[0]), 'blue')
## end
end = max(map(lambda x: [x[0],x[1]], ldate))
settings.console('newest file: %s'%(end[0]), 'blue')
#closing sftp
sftp.close()
#----log
# if new files
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
if file_num > 0:
settings.console("Finished downloading files", 'blue')
row = [now, file_num, remote_num, end[1]]
fpath = "%s/%s.xlsx"%(log_path, name)
# if log exists, update
if os.path.exists(fpath):
settings.console("Log updated @: %s"%(fpath), 'blue')
#load file
wb = openpyxl.load_workbook(fpath)
# Select First Worksheet
ws = wb.worksheets[0]
# add data
ws.append(row)
# update
wb.save(fpath)
# import log
log = pd.ExcelFile(fpath)
# else create log
else:
settings.console('Log created @: %s'%(fpath), 'blue')
headers = ['date', 'downloaded', 'total', 'newest']
# create dataframe and save as xlsx
log = pd.DataFrame([row], columns=headers)
log.to_excel(fpath, index=False)
else:
settings.console("No files to download", 'blue')
return log, start, end, now
@classmethod
def SQL(cls, driver, destination, hostname, username, password, database, table, **kwargs):
"""
Connect to a SQL database server (MySQL or MSSQL) and return the connection.
Parameters
----------
driver : :obj:`str` {'MySQL','MSSQL'}
The type of SQL server, either **MySQL** or **MSSQL**.
hostname : :obj:`str`
The host name or IP address of the sql server.
username : :obj:`str`
The user name used to authenticate with the sql server.
password : :obj:`str`
The password to authenticate the user with the sql server.
database : :obj:`str`
The database name to use when connecting with the sql server.
table : :obj:`str`
The table name to use when connecting with the sql server.
**kwargs : :obj:`str` or :obj:`None`, optional
Additional properties, relevant for specific content types. Here's a list of available properties:
.. list-table::
:class: kwargs
:widths: 25 50
:header-rows: 1
* - Property
- Description
* - **port** : :obj:`int`
- The port number to use when connecting to the sql server.
Returns
-------
connection : :obj:`mysql.connector.connect <https://dev.mysql.com/doc/connector-python/en/connector-python-api-mysql-connector-connect.html>`__
MySQL connector instance.
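Examples
--------
A minimal sketch of a MySQL connection; the hostname, credentials, database, and table names below are placeholders.
>>> cnx = Download.SQL(driver='MySQL', destination='', hostname='127.0.0.1', username='user', password='pass', database='r33', table='data', port=3306)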
"""
# parameters
config = kwargs["config"] if "config" in kwargs else None
port = kwargs["port"] if "port" in kwargs else None
cnx = None
# start
import mysql.connector
from mysql.connector import errorcode
from mysql.connector.constants import ClientFlag
# if a full configuration dict was provided, use it as-is; otherwise build one
if config:
pass
else:
config = {
'user': username,
'password': password,
'host': hostname,
'database': database,
'get_warnings': True,
'client_flags': [ClientFlag.SSL],
}
# if port
if port:
config['port'] = port
try:
cnx = mysql.connector.connect(**config)
cur = cnx.cursor(buffered=True)
except mysql.connector.Error as err:
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
print("Something is wrong with your user name or password")
elif err.errno == errorcode.ER_BAD_DB_ERROR:
print("Database does not exist")
else:
print(err)
return cnx