# Source code for imhr.data.download

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
| `@purpose`: Download raw data from apache, Box, or REDCap servers.  
| `@date`: Created on Sat May 1 15:12:38 2019  
| `@author`: Semeon Risom  
| `@email`: semeon.risom@gmail.com  
| `@url`: https://semeon.io/d/imhr
"""

# available functions
__all__ = ['Download']

# required = 
__required__ = ['openpyxl','openpyxl']
# core
from pdb import set_trace as breakpoint

# local libraries
from .. import settings

# check that core (stdlib) dependencies are importable
try:
	# core
	import os
	from datetime import datetime
	from pathlib import Path
except ImportError as e:
	# map import names to their PyPI package names before reporting
	pkg = e.name
	x = {'psychopy':'psychopy','pandas':'pandas'}
	pkg = x[pkg] if pkg in x else pkg
	# NOTE(review): the trailing 'red' looks like a settings.console() color
	# argument accidentally passed to Exception — confirm intent.
	raise Exception("No module named '%s'. Please install from PyPI before continuing."%(pkg),'red')

class Download():
	"""Download raw data from apache, Box, or REDCap servers."""

	@classmethod
	def __init__(self, isLibrary=False):
		"""Download raw data from apache, Box, or REDCap servers.

		Parameters
		----------
		isLibrary : :obj:`bool`
			Check if required libraries are available.
		"""
		# optionally verify that required third-party packages are installed
		if isLibrary:
			settings.library(__required__)
[docs] @classmethod def REDCap(cls, path, token, url, content, payload=None, **kwargs): """Download data from an Research Electronic Data Capture (REDCap) server. Parameters ---------- path : :obj:`str` Path to save data. For example:: >>> path = '/Users/mdl-admin/Desktop/r33/redcap' token : :obj:`str` The API token specific to your REDCap project and username. This is usually found on the Applications > API page. For example:: >>> token = 'D19859823032SFDMR24395298' url : :obj:`str` The URL of the REDCap project. For example:: >>> url = 'https://redcap.prc.utexas.edu/redcap/api/'. content : :obj:`str` {report, file, raw, arm, instrument, returnFormat, metadata, project, surveyLink, user, participantList} Type of export. Examples include exporting a report (`report`), file (`file`), or project info (`project`). payload_ : :obj:`dict` or :obj:`None`, optional Manually submit parameters for exporting or importing data. Can be entered within the function for convenience. **kwargs : :obj:`str` or :obj:`None`, optional Additional properties, relevent for specific content types. Here's a list of available properties: .. list-table:: :class: kwargs :widths: 25 50 :header-rows: 1 * - Property - Description * - **report_id** : :obj:`str` - (report, record) The report ID number provided next to the report name on the report list page. * - **cformat** : :obj:`str` {csv, json, xml, odm} - Format to return data, either csv, json, xml, or odm. Default is json. * - **ctype** : :obj:`str` - Shape of data. Default is flat. * - **rawOrLabel**: :obj:`str` {raw, label} - (report, record) TExport the raw coded values or labels for the options of multiple choice fields. * - **rawOrLabelHeaders**: :obj:`str` - (report, record) TExport the variable/field names (raw) or the field labels (label). * - **exportCheckboxLabel**: :obj:`str` - Specifies the format of checkbox field values specifically when exporting the data as labels (i.e., when rawOrLabel=label). 
* - **returnFormat** : :obj:`str` - Format to return errors. Default is `json`. Returns ------- log : :obj:`pandas.DataFrame` or :obj:`None` Pandas dataframe of each download request. content : :obj:`pandas.DataFrame` or :obj:`None` Pandas dataframe of all data downloaded. start, end : :obj:`str` Timestamp (ISO format) and name of most recent (`end`) and first (`start`) file created in folder. now : :obj:`str` Current timestamp in ISO format. """ import openpyxl, requests import pandas as pd #----constants, lists to prepare ldate = [] #list of dates file_num = 0 #file counter # bool log = None #log of events start = None #most recent file end = None #most recent file #----path # log name = Path(path).name log_path = os.path.abspath(os.path.dirname(path + "/../")) # destination path = Path(path) #----kwargs # general cformat = kwargs['cformat'] if "cformat" in kwargs else 'json' ctype = kwargs['ctype'] if "ctype" in kwargs else 'flat' rawOrLabel = kwargs['rawOrLabel'] if "rawOrLabel" in kwargs else 'raw' rawOrLabelHeaders = kwargs['rawOrLabelHeaders'] if "rawOrLabelHeaders" in kwargs else 'raw' exportCheckboxLabel = kwargs['exportCheckboxLabel'] if "exportCheckboxLabel" in kwargs else 'false' returnFormat = kwargs['returnFormat'] if "returnFormat" in kwargs else 'json' # report report_id = kwargs['report_id'] if "report_id" in kwargs else 'None' # participantList instrument = kwargs['instrument'] if "instrument" in kwargs else 'cssrs' event = kwargs['event'] if "event" in kwargs else 'online_eligibility_arm_1' #----start settings.console('connecting to REDCap', 'blue') #----make sure local path exists settings.console('local folder: %s'%(Path(path)), 'blue') if not os.path.exists(Path(path)): settings.console('creating local folder: %s'%(Path(path)), 'blue') os.makedirs(Path(path)) #----start download # prepare if payload == None: # report if content == 'report': payload = { 'token': token, 'content': content, 'format': cformat, 'type': ctype, 'returnFormat': 
returnFormat, 'report_id': report_id, 'rawOrLabel': rawOrLabel, 'rawOrLabelHeaders': rawOrLabelHeaders, 'exportCheckboxLabel': exportCheckboxLabel } elif content == 'participantList': payload = { 'token': token, 'content': content, 'format': cformat, 'returnFormat': returnFormat, 'instrument': instrument, 'event': event, } elif content == 'metadata': payload = { 'token': token, 'content': content, 'format': cformat, 'returnFormat': returnFormat, } elif content == 'project': payload = { 'token': token, 'content': content, 'format': cformat, 'returnFormat': returnFormat, } else: raise Exception('Not finished. Content `%s` unavailable.'%(content)) # get data response = requests.post(url, data=payload) json = response.json() # if error if 'error' in json: raise Exception('Export failed. Reason: %s.'%(json['error'])) # convert to dataframe df = pd.DataFrame(json) # save data path_= os.path.dirname(path) + "/REDCap/%s.xlsx"%(content) df.to_excel(path_, index=False) # if list isnt empty get start and end dates if ldate: ## start start = min(map(lambda x: [x[0],x[1]], ldate)) settings.console('oldest file: %s'%(start[0]), 'blue') ## end end = max(map(lambda x: [x[0],x[1]], ldate)) settings.console('newest file: %s'%(end[0]), 'blue') #----log # if new files now = datetime.now().strftime('%Y-%m-%d %H:%M:%S') #!!! 
add values to go into dataframe---- row = [df.shape[0],token,content,now] # log file path fpath = "%s/%s.xlsx"%(log_path, name) # if log exists, update if os.path.exists(fpath): settings.console("Log updated @: %s"%(fpath), 'blue') #load file wb = openpyxl.load_workbook(fpath) # Select First Worksheet ws = wb.worksheets[0] # add data ws.append(row) # update wb.save(fpath) # import log log = pd.ExcelFile(fpath) # else create log else: settings.console('Log created @: %s'%(fpath), 'blue') headers = ['subjects', 'token', 'content', 'date'] # creating datafame and save as xlsx log = pd.DataFrame([row], columns=headers) log.to_excel(fpath, index=False) return log, start, end, now
[docs] @classmethod def SFTP(cls, source, destination, hostname, username, password, **kwargs): """Connect to a remote server using a Secure File Transfer Protocol (SFTP). Parameters ---------- source : :obj:`str` The directory path to retrieve paticipant data. destination : :obj:`str` The directory path to save paticipant data. hostname : :obj:`str` SFTP hostname. username : :obj:`str` SFTP username. password : :obj:`str` SFTP password. **kwargs : :obj:`str` or :obj:`None`, optional Additional properties, relevent for specific content types. Here's a list of available properties: .. list-table:: :class: kwargs :widths: 25 50 :header-rows: 1 * - Property - Description * - **filetype** : ::obj:`str` or :obj:`None` - Filetype to download. Default is csv. Returns ------- log : :obj:`pandas.DataFrame` or :obj:`None` Pandas dataframe of each download request. content : :obj:`pandas.DataFrame` or :obj:`None` Pandas dataframe of all files downloaded. start, end : :obj:`str` Timestamp (ISO format) and name of most recent (`end`) and first (`start`) file created in folder. now : :obj:`str` Current timestamp in ISO format. 
Examples -------- >>> name='r33'; source='/home/utweb/utw1211/public_html/r33'; d='/Users/mdl/Desktop/r33/'; un='utw1211'; pwd='43#!9amZ?K$' >>> log, start, end, now = download.SFTP(source=s, destination=d, hostname=hostname, username=un, password=pwd) """ import paramiko, openpyxl import pandas as pd #----kwargs lfiletype = kwargs["filetype"].replace('.','') if "filetype" in kwargs else ['csv'] #----constants, lists to prepare ldate = [] #list of dates file_num = 0 #file counter # bool log = None #log of events start = None #most recent file end = None #most recent file # path ## log name = Path(destination).name log_path = os.path.abspath(os.path.dirname(destination + "/../")) ## destination destination = Path(destination) settings.console('connecting to sftp', 'blue') #----make sure local path exists settings.console('local folder: %s'%(Path(destination)), 'blue') if not os.path.exists(Path(destination)): settings.console('creating local folder: %s'%(Path(destination)), 'blue') os.makedirs(Path(destination)) #----connect to SSH client client = paramiko.SSHClient() client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) client.connect(hostname=hostname, username=username, password=password) sftp = client.open_sftp() #----set path and get list of files sftp.chdir(source) remote_path = str(sftp.getcwd()) + '/' remote_directory = sftp.listdir() remote_attr = sftp.listdir_attr() #----check number of files in remote directory # list comprehension: <for each file in remote directory> <check if file extension matches> <and if file extension isn't empty string> remote_num = len([x for x in remote_directory if Path(x).suffix.replace('.','') in lfiletype and bool(Path(x).suffix.replace('.',''))]) try: if remote_num >= 1: settings.console('Total files in %s: %s'%(remote_path, remote_num), 'blue') else: raise Exception("Error: No %s found in folder: '%s'"%(lfiletype, remote_path)) except Exception as error: settings.console(str(error), 'red') raise #----export files 
settings.console("Starting download from %s"%(remote_path), 'blue') # for each file in remote directory for file in remote_attr: # file filename = file.filename filetype = Path(filename).suffix.replace('.','') # if file is correct filetype and not empty string if (filetype in lfiletype) and (bool(filetype)): # path remote_fpath = remote_path + filename local_fpath = '%s/%s'%(destination, filename) # date date = datetime.fromtimestamp(file.st_mtime).strftime('%Y-%m-%d %H:%M:%S') ldate.append([date, filename]) # if file not in local drive, download if not os.path.isfile(local_fpath): settings.console('filename: %s, type: %s, date: %s'%(filename, lfiletype, date), 'green') # counter file_num = file_num + 1 # download sftp.get(remote_fpath, local_fpath) # if list isnt empty get start and end dates if ldate: ## start start = min(map(lambda x: [x[0],x[1]], ldate)) settings.console('oldest file: %s'%(start[0]), 'blue') ## end end = max(map(lambda x: [x[0],x[1]], ldate)) settings.console('newest file: %s'%(end[0]), 'blue') #closing sftp sftp.close() #----log # if new files now = datetime.now().strftime('%Y-%m-%d %H:%M:%S') if file_num > 0: settings.console("Finished downloading files", 'blue') row = [now, file_num, remote_num, end[1]] fpath = "%s/%s.xlsx"%(log_path, name) # if log exists, update if os.path.exists(fpath): settings.console("Log updated @: %s"%(fpath), 'blue') #load file wb = openpyxl.load_workbook(fpath) # Select First Worksheet ws = wb.worksheets[0] # add data ws.append(row) # update wb.save(fpath) # import log log = pd.ExcelFile(fpath) # else create log else: settings.console('Log created @: %s'%(fpath), 'blue') headers = ['folder', 'downloaded', 'total', 'newest'] # creating datafame and save as xlsx log = pd.DataFrame([row], columns=headers) log.to_excel(fpath, index=False) else: settings.console("No files to download", 'blue') return log, start, end, now
[docs] @classmethod def SQL(cls, driver, destination, hostname, username, password, database, table, **kwargs): """ [summary] Parameters ---------- type : :obj:`str` {'MySQL','MSSQL'} The type of SQL server. Either **MySQL** or **MSSQL**. hostname : :obj:`str` The host name or IP address of the sql server. username : :obj:`str` The user name used to authenticate with the sql server. password : :obj:`str` The password to authenticate the user with the sql server. database : :obj:`str` The database name to use when connecting with the sql server. table : :obj:`str` The table name to use when connecting with the sql server. **kwargs : :obj:`str` or :obj:`None`, optional Additional properties, relevent for specific content types. Here's a list of available properties: .. list-table:: :class: kwargs :widths: 25 50 :header-rows: 1 * - Property - Description * - **port** : :obj:`int` - (report, record) The report ID number provided next to the report name on the report list page. Returns ------- connection : :obj:`mysql.connector.connect <https://dev.mysql.com/doc/connector-python/en/connector-python-api-mysql-connector-connect.html>`__ MySQL connector instance. """ # parameters config = kwargs["config"] if "config" in kwargs else False port = kwargs["port"] if "port" in kwargs else False cnx = '' # start config = None import mysql.connector from mysql.connector import errorcode from mysql.connector.constants import ClientFlag # if configuration if config: pass else: config = { 'user': username, 'password': password, 'host': host, 'database': database, 'get_warnings': True, 'client_flags': [ClientFlag.SSL], } # if port if port: config['port'] = port try: cnx = mysql.connector.connect(**config) cur = cnx.cursor(buffered=True) except mysql.connector.Error as err: if err.errno == errorcode.ER_ACCESS_DENIED_ERROR: print("Something is wrong with your user name or password") elif err.errno == errorcode.ER_BAD_DB_ERROR: print("Database does not exist") else: print(err) return cnx