Source code for imhr.Webgazer.model

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
| @purpose: Build models for analysis.   
| @date: Created on Sat May 1 15:12:38 2019
| @author: Semeon Risom
| @email: semeon.risom@gmail.com
| @url: https://semeon.io/d/R33-analysis
"""

# global
from pdb import set_trace as breakpoint
import os, re
import datetime

# local
from . import plot
from .. import settings

# constants
console = settings.console
debug = settings.debug
stn = settings.stn
popover = settings.popover
link = settings.link

# rpy2
##set path
#os.environ["R_HOME"] = "/Library/Frameworks/R.framework/Versions/Current/Resources/"

#-------Notes
# Resources:
# - https://stats.idre.ucla.edu/other/mult-pkg/whatstat/

__all__ = ['anova','logistic','lmer']

[docs]def anova(config, y, f, df, csv, path, effects, is_html=True):
    """Run analysis of variance model using rpy2, seaborn and pandas.

    Parameters
    ----------
    y : :obj:`str`
        Response variable.
    f : :obj:`str`
        Formula to use for analysis.
    df : :class:`pandas.core.frame.DataFrame`
        Pandas dataframe of raw data.
    f : :obj:`str`
        R-compatiable formula.
    csv : :obj:`str`
        Name of generated CSV file to run analysis in R.
    path : :obj:`str`
        The directory path to save the generated files.
    effects : :obj:`list`
        List of main effects.
    is_html : :obj:`bool`
        Whether html should be generated.
    
    Returns
    -------
    model : :class:`rpy2.robjects.methods.RS4`
        Python representation of an R instance of class 'S4'.
    df_anova : :class:`pandas.core.frame.DataFrame`
        Pandas dataframe of model output.
    get_anova : :class:`str`
        R script to run model.
    html : :class:`str
        Html output.
    
    Notes
    -----
    **Resources**
        - https://rpsychologist.com/r-guide-longitudinal-lme-lmer
        - https://sites.ualberta.ca/~lkgray/uploads/7/3/6/2/7362679/slides_- anova_assumptions.pdf
        - https://rpubs.com/tmcurley/twowayanova
        - https://rstudio-pubs-static.s3.amazonaws.com/158708_78d414c091fc47bd99f6f75e3bd8f4cb.html
        - https://m-clark.github.io/docs/mixedModels/anovamixed.html
        - http://dwoll.de/rexrepos/posts/anovaMixed.html
        - https://stats.stackexchange.com/questions/247582/repeated-measures-anova-in-r-errorsubject-vs-errorsubject-day
        - https://cran.r-project.org/web/packages/afex/vignettes/afex_anova_example.html#post-hoc-contrasts-and-plotting
        - http://www.let.rug.nl/nerbonne/teach/rema-stats-meth-seminar/presentations/Wieling-MixedModels-2011.pdf
    
    **Definition**:
        A test that allows one to make comparisons between the means of multiple groups of data, where two independent variables are considered. 
    
    **Assumptions of ANOVA**
        #. Normal distribution (normality)
            - Short: Samples are drawn from a normally distributed population (Q-Q Plot, Shapiro-Wilks Test)
            - Detailed Definition: Residuals in data are normally distributed.
        #. Homogeneity of variance (homoscedasticity)
            - Short: Variances are equal (or similar).
            - Detailed: Varience for a DV is constant across the sample. (residual vs fitted plot, Scale-Location plot, Levene's test)
        #. Independent observations
            - Samples have been drawn independently of each other. No analysis needed.
        
    **Hypothesis Interpretation**
        - **Null**: The means of all levels of an IV groups are equal.
        - **Alternative**: The mean of at least level of an IV is different.
    """
    from rpy2.robjects import pandas2ri, r
    pandas2ri.activate()
    
	#----for timestamp
    _t0 = datetime.datetime.now()
    _f = debug(message='t', source="timestamp")
    console('model.anova(%s)'%(y), 'green')
    
    #----metadata
    source = "anova"
    
    #---------check if paths exist
    for path_ in [path, path+"/csv/", path+"/img/"]:
        if not os.path.exists(path_):
            os.makedirs(path_)
    
    #---------save data for access by R
    df.to_csv(path + "/csv/" + csv, index=None)

    #---------get number of remaining subjects (datapoints) used
    subject_anova = df.drop_duplicates(subset="participant", keep="first").shape[0]
    
    #---------run r
    #code
    get_anova = '\n'.join([
        '#!/usr/bin/env Rscript3.5.1',
        'rm(list=ls());',
        '#----library',
        '# core',
        'library(tidyverse);',
        'library(broom);',
        '# analysis',
        'library(lme4); library(lmerTest);',
        '# plot',
        'library(ggplot2);',
        '# estimated marginal means',
        'library(emmeans); library(multcomp);',
        '\t' + '',
        "#----repeated measures anova model", 
        'anova_ <- function(){',
        '\t' + "#----load data",
        '\t' + 'path <- "%s"'%(path + "csv/"),
        '\t' + "df <- read.csv(file=file.path(path, '%s'), header=TRUE)"%(csv),
        '\t' + '',
        '\t' + '#----set type',
        '\t' + 'df$trialType <- factor(df$trialType)',
        '\t' + 'df$cesd_group <- factor(df$cesd_group)',
        '\t' + 'df$participant <- factor(df$participant)',
        '\t' + '',
        '\t' + '#----run model',
        '\t' + 'lmer_ <- lmerTest::lmer(%s, data=df)'%(f),
        '\t' + 'aov_ <- stats::anova(lmer_)',
        '\t' + '',
        '\t' + '#----Estimated Marginal Means (Least-Squares Means) of all factor levels',
        '\t' + 'pair_ <- emmeans::emmeans(lmer_, c(%s),'%(','.join("'{0}'".format(x) for x in effects['main'].keys())),
        '\t' + '                          type = "response", adjust = "tukey")',
        '\t' + '# Comparison of Differences Between Levels of Factor',
        '\t' + 'lsmeans_ <- summary(emmeans::as.glht(pairs(pair_)), test=adjusted("free")) %>% ',
        '\t' + '   broom::tidy()',
        '\t' + '',
        '\t' + '#----getting x,y coordinates for qqplot',
        '\t' + '# create plot',
        '\t' + 'gg <- ggplot(lmer_) + ',
        '\t' + '  ggplot2::stat_qq(aes(sample = .resid, colour = factor(trialType))) + ',
        '\t' + '  ggplot2::geom_abline(linetype = "dotted")',
        '\t' + '',
        '\t' + '# convert to tibble',
        '\t' + 'gg <- ggplot_build(gg)[["data"]][[1]] %>% ',
        '\t' + '  dplyr::select(sample, theoretical)',
        '\t' + '',
        '\t' + '#----creating residuals tibble from model',
        '\t' + '# including raw data, residuals vs fitted)',
        '\t' + '# .resid=residuals, .fitted=predicted values .estimate=estimate of fixed effect',
        '\t' + 'residuals <- broom::augment(lmer_) %>% ',
        '\t' + '   dplyr::select(%s, .resid, .fitted)'%('participant' + ', ' + y + ', ' + ', '.join(effects['main'].keys())),
        '\t' + '',
        '\t' + '#----merge residuals and qq tibble',
        '\t' + 'residuals <- as.data.frame(merge(x=residuals, y=gg, by.x=".resid", by.y="sample", all.x = TRUE))',
        '\t' + '',
        '\t' + '#----prepare data for export',
        '\t' + '# convert model output to tibble',
        '\t' + 'output <- broom::tidy(aov_)',
        '\t' + '# convert model summary to tibble',
        '\t' + 'summary <- broom::glance(lmer_)',
        '\t' + '',
        '\t' + '#----return model output, model, residuals, lsmeans, model summary',
        '\t' + 'return(list(output, lmer_, residuals, lsmeans_, summary))',
    '}'])
    # load r function
    anova_r = r(get_anova)
    # run
    df_r = anova_r()
    
    #----extract df and model from rpy2
    df_anova = df_r[0]
    model = df_r[1]
    residuals = df_r[2]
    lsmeans = df_r[3]
    summary = df_r[4]
    
    #----clean data
    #rename
    df_anova = df_anova.rename(columns={'sumsq':'SS','meansq':'MS','statistic':'f','p.value':'Pr(>|f|)','DenDF':'DF','NumDF':'N'})
    # round p-value
    df_anova['Pr(>|f|)'] = df_anova[['Pr(>|f|)']].apply(lambda x: x.dropna().round(4).astype(str))
    #rename columns
    df_anova = df_anova.rename_axis("index", axis="columns")
    
    #----format summary
    summary = summary.rename_axis("index", axis="columns")
    summary = summary.to_html(index=True, index_names=True).replace('<table border="1" class="dataframe">',
    '<table id="table1" class="table '+source+' table-striped table-bordered hover dt-responsive nowrap"\ cellspacing="0" width="100%">')
        
    #----prepare metadata
    short_ = config['metadata']['short']
    long_ = config['metadata']['long']
    def_ = config['metadata']['def']
    cite_ = config['metadata']['cite']
    url_ = config['metadata']['url']
    var_ = config['metadata']['var']
    img_ = config['metadata']['img']
    
    #---------title, footnote, results
    title = '<b>Table 1.</b> Repeated Measures ANOVA for %s (<i>N</i> = %s).'%(long_[y], subject_anova)
    #----description
    description = ''.join([
        "<p><b>Type III Analysis of Variance Table with Satterthwaite's method</b> \
        [%a]. "%(link(name='anova', url='https://www.rdocumentation.org/packages/stats/versions/3.5.3/topics/anova')),
        "<div class='paragraph'>",
            "<div>The assumptions for the model are:</div>",
            '<ul class="number-list">',
                '<li>%s (%s).</li>'
                %(popover(name=long_['nd'],title=long_['nd'],description=def_['nd']),
                  link(name='Q-Q Plot', url='#qq')),
                '<li>%s (%s).</li>'
                %(popover(name=long_['hv'],title=long_['hv'],description=def_['hv']),
                  link(name='Residual vs Fitted Plot', url='#rf')),
                '<li>%s</li>'
                %(popover(name=long_['io'],title=long_['io'],description=def_['io'])),
            '</ul>',
        '</div>'
        "<div class='paragraph'>",
            "<div>The following post-hoc analysis were run:</div>",
            '<ul class="number-list">',
                '<li>%s.</li>'
                %(link(name='Estimated Marginal Means', url='#lsmean')),
                '<li>%s.</li>'
                %(link(name='Pairwise Comparison Plot', url='#boxplot')),
                '<li>%s.</li>'
                %(link(name='Q-Q Plot', url='#qq')),
                "<li>%s.</li>"
                %(link(name='Residual vs Fitted Plot', url='#rf')),
            '</ul>',
        '</div>'
    ])
    
    #----results
    results = [
        '<div class="subtitle">Statistical Analysis</div>',
        "<p>",
        def_['exclude'],
        'We employed a repeated measures ANOVA using the anova() function in the <i>stats</i> R package \
        (%s; %s).'%(\
        popover(name=long_['R'], title=long_['R'], description=cite_['R']),
        popover(name=long_['anova'], title=long_['anova'], description=cite_['anova']))
    ]
        
    #list of main effects    
    #"The fixed effects was %s,"%(def_['cesd_group'])
    if effects['main'] != None:
        #create qualifier #check if single random effect or multiple random effects
        qualifier = 'was included' if len(effects['main']) == 1 else 'were included'
        #start of statement
        results.append("For our model, ")
        #for each effect
        for idx, effect in enumerate(effects['main']):
            #if single item
            if len(effects['main']) == 1:
                results.append(" %s."%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
            #multiple items
            else:
                #last item
                if idx + 1 == len(effects['main']):
                    results.append("and %s."%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
                else:
                    results.append("%s,"%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
        #end  
        results.append("%s as main effects."%(qualifier))
    
    #list of random effects
    #stimulus (within subjects; %s, %s),"%(def_['iaps'], def_['pofa']),
    if effects['random'] != None:
        #create qualifier #check if single random effect or multiple random effects
        qualifier = 'was included' if len(effects['random']) == 1 else 'were included'
        #start of statement
        results.append("Random effects for")
        #for each effect
        for idx, effect in enumerate(effects['random']):
            #if single item
            if len(effects['random']) == 1:
                results.append(" %s."%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
            #multiple items
            else:
                #last item
                if idx + 1 == len(effects['random']):
                    results.append("and %s."%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
                else:
                    results.append("%s,"%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
        #end
        results.append('%s in the model to account for their respective variation in their slopes and intercepts.'%(qualifier))
    
    #outcome variable
    results.append('%s was used as the outcome measure.</p><p>'%(popover(name=long_[y],title=long_[y],description=def_[y])))
                 
    # results
    results.append('For our analysis of %s, our results revealed'%(short_[y]))
    for idx, effect in enumerate(effects['main']):
        #get row
        row = df_anova[df_anova['term'].str.contains(effect)]
        #get term, B, se, z, p # lmer
        name = effect
        N_ = int(row['N'].values[0])
        DF_ = int(row['DF'].values[0])
        f_ = stn(row['f'].values[0])
        p_ = stn(row['Pr(>|f|)'].values[0])
        #significance
        if float(p_) <= 0.05:
            results.append('a statistically significant effect of')
        else:
            results.append('no statistically significant effect of')
        #if single item
        if len(effects['main']) == 1:
            results.append('%s (<a><i>F</i> (%s, %s) = %s, <i>p</i> = %s</a>).'%(short_[name], N_, DF_, f_, p_))
        #multiple items
        else:
            #last item
            if idx + 1 == len(effects['main']):
                results.append('%s (<a><i>F</i> (%s, %s) = %s, <i>p</i> = %s</a>).'%(short_[name], N_, DF_, f_, p_))
            else:
                results.append('%s (<a><i>F</i> (%s, %s) = %s, <i>p</i> = %s</a>),'%(short_[name], N_, DF_, f_, p_))
    ## build results            
    results = ' '.join(results)
    
    # combine
    footnote = summary + re.sub(r'\s+', ' ', description + results).strip()
    
    #----create script
    script = ['<div class="code-container" style="display: none">' + '\n',
                '<div class="button-bar">'+'\n',
                    '<a href="#" class="btn code hidden" source="copy" role="button">Copy</a>',
                    '<a href="#" class="btn code hidden" source="download" role="button">Download</a>',
                '</div>'+'\n',
                '<pre class="line-numbers">' + '\n',
                    #'<code id="editor" class="lang-r">'+'\n', #tinymce
                    '<code contenteditable="true" class="lang-r" name=%s>'%(var_[y]) +'\n', #prismjs
                        '%s'%(get_anova) + '\n',
                    '</code>' + '\n',
                '</pre>' + '\n',
            '</div>\n']
    script = ''.join(script)
    
    #----plots and tables
    #lsmeans table
    #https://cran.r-project.org/web/packages/afex/vignettes/afex_anova_example.html
    try:         
        lsmeans = lsmeans.drop(['rhs'], 1)
        lsmeans['p.value'] = lsmeans[['p.value']].apply(lambda x: x.dropna().round(4).astype(str))
        lsmeans = lsmeans.rename_axis("index", axis="columns")
        lsmeans = lsmeans.rename(columns={'lhs':'Contrasts','std.error':'SE','statistic':'t','p.value':'Pr(>|t|)'})
    except:
        pass
    #create
    html_plots = []
    title_ = '<b>Table 2.</b> Pairwise Comparisons of Estimated Marginal Means.'
    footnote_ = def_['emm']
    html_plots.append({"title":title_,"footnote":footnote_,"anchor":"lsmean",'type':'table','df':lsmeans})

    # for each main effect (if categorical)
    for idx, effect in enumerate(effects['main']):
        # if categorical variable and not trialType
        if (effects['main'][effect] == 'categorical') and (effect != 'trialType'):
            #boxplot
            file = "%s_%s_boxplot.png"%(y, effect)
            path_ = path + "/img/" + file
            title_="Pairwise Comparisons Plot."
            footnote_ = 'This boxplot provides a comparison of %s and across both %s and %s stimuli. \
            Both IAPS and POFA plots were nornalized to allow direct comparison.'\
            %(short_[effect],
              popover(name=short_['iaps'], title=long_['iaps'], description=cite_['iaps'], image=img_['iaps']),
              popover(name=short_['pofa'], title=long_['pofa'], description=cite_['pofa'], image=img_['iaps']))
            # append and draw
            html_plots.append({"title":title_,"file":"%s"%(file),"footnote":footnote_, "anchor":"boxplot", 'type':'plot'})
            plot.boxplot(config=config, df=residuals, path=path_, x=effect, y=y, cat='bias')
    
    #probability (QQ) plot
    file = "%s_qq.png"%(y)
    path_ = path + "/img/" + file
    title_="Q-Q Plot (<a class='cat iaps'>iaps</a>, <a class='cat pofa'>pofa</a>)."
    footnote_ = def_['qq']
    html_plots.append({"title":title_,"file":"%s"%(file),"footnote":footnote_, "anchor":"qq", 'type':'plot'})
    plot.qq_plot(config=config, y=y, residuals=residuals, path=path_)
    
    #residuals vs fitted plot
    file = "%s_residuals.png"%(y)
    path_ = path + "/img/" + file
    title_="Residuals vs Fitted Plot (<a class='cat iaps'>iaps</a>, <a class='cat pofa'>pofa</a>)."
    footnote_ = def_['rvf']
    html_plots.append({"title":title_,"file":"%s"%(file),"footnote":footnote_, "anchor":"rf", 'type':'plot'})
    plot.residual_plot(config=config, y=y, residuals=residuals, path=path_)
    
    #save model and plot
    ##create html
    html = None
    if is_html:
        path_ = path + '%s.html'%(y)
        html = plot.html(config=config, df=df_anova, raw_data=df, path=path_, source=source, title=title, name=y, script=script, 
                  plots=html_plots, footnote=footnote, var=var_[y], short=short_[y], long=long_[y])
    #----end
    console('%s finished in %s msec'%(_f,((datetime.datetime.now()-_t0).total_seconds()*1000)), 'blue')
    return model, df_anova, get_anova, html
 
[docs]def lmer(config, y, f, df, exclude, csv, path, effects, is_html=True):
    """Run linear mixed regression model, using rpy2, seaborn and pandas.

    Parameters
    ----------
    y : :obj:`str`
        Response variable.
    f : :obj:`list` of :obj:`str`
        Formula to use for analysis.
    df : :class:`pandas.core.frame.DataFrame`
        Pandas dataframe of raw data.
    f : :obj:`str`
        R-compatiable formula.
    exclude : :obj:`list`
        List of participants to be excluded.
    csv : :obj:`str`
        Name of generated CSV file to run analysis in R.
    path : :obj:`str`
        The directory path to save the generated files.
    effects : :obj:`list`
        List of main effects.
    is_html : :obj:`bool`
        Whether html should be generated.
        
    Returns
    -------
    model : :class:`rpy2.robjects.methods.RS4`
        Python representation of an R instance of class 'S4'.
    df_lmer : :class:`pandas.core.frame.DataFrame`
        Pandas dataframe of model output.
    get_lmer : :class:`str`
        R script to run model.
    html : :class:`str
        Html output.
        
    Notes
    -----
    **Resources**
        - https://rpsychologist.com/r-guide-longitudinal-lme-lmer
        - https://stackoverflow.com/questions/47686227/poisson-regression-in-statsmodels-and-r
        - https://tsmatz.wordpress.com/2017/08/30/glm-regression-logistic-poisson-gaussian-gamma-tutorial-with-r/
        - https://stats.stackexchange.com/questions/311556/help-interpreting-count-data-glmm-using-lme4-glmer-and-glmer-nb-negative-binom
    """
    from rpy2.robjects import pandas2ri, r
    pandas2ri.activate()

    #----for timestamp
    _t0 = datetime.datetime.now()
    _f = debug(message='t', source="timestamp")
    console('model.lmer(%s)'%(y), 'green')
    
    #----metadata
    source = 'onset'
    
    #-----exclude participants
    df_ex = df[~df['participant'].isin(exclude)]
    
    #----check if paths exist
    for path_ in [path, path + "csv/", path+"/img/"]:
        if not os.path.exists(path_):
            os.makedirs(path_)
    
    #----save data for access by R
    df_ex.to_csv(path + "csv/" + csv, index=None)
        
    #----get number of remaining subjects (datapoints) used
    subject_poisson = df_ex.drop_duplicates(subset="participant", keep="first").shape[0]
    
    #----model
    model_ = 'lmerTest::lmer(%s, data=df)'%(f),
    
    #----run r
    #code
    get_lmer = '\n'.join([
        '#!/usr/bin/env Rscript3.5.1',
        'rm(list=ls());',
        '#----library',
        '# core',
        'library(tidyverse);',
        'library(broom); library(broom.mixed);',
        '# analysis',
        'library(lme4); library(lmerTest);',
        '# plot',
        'library(ggplot2);',
        '# estimated marginal means',
        'library(emmeans); library(multcomp);',
        '\t' + '',
        "#----linear mixed model", 
        'lmer_ <- function(){',
        '\t' + "#----load data",
        '\t' + 'path <- "%s"'%(path + "csv/"),
        '\t' + "df <- read.csv(file=file.path(path, '%s'), header=TRUE)"%(csv),
        '\t' + '',
        '\t' + '#----drop data',
        '\t' + '# os #samples too small',
        '\t' + 'df <- df[(!df$os=="cos"),]',
        '\t' + '',
        '\t' + '#----normalize trial to [0,1] (recommended by Jason)',
        '\t' + 'df$TrialNum <- lapply(df$TrialNum, function(x){((x - 0)/(197 - 0))})',
        '\t' + '',
        '\t' + '#----set type', 
        '\t' + '# set as factor',
        '\t' + "df$os <- factor(df$os)",
        '\t' + "df$trialType <- factor(df$trialType)",
        '\t' + 'df$participant <- factor(df$participant)',
        '\t' + '# set trial as numeric (recommended by Jason)',
        '\t' + 'df$TrialNum <- as.numeric(df$TrialNum)',
        '\t' + '',      
        '\t' + '#----run model',
        '\t' + 'model <- %s'%(model_),
        '\t' + '',
        '\t' + '#----getting x,y coordinates for qqplot',
        '\t' + '# create plot',
        '\t' + 'gg <- ggplot(model) + ',
        '\t' + '  ggplot2::stat_qq(aes(sample = .resid, colour = factor(trialType))) + ',
        '\t' + '  ggplot2::geom_abline(linetype = "dotted") + ',
        '\t' + '  theme_bw()',
        '\t' + '# convert to tibble',
        '\t' + 'gg <- ggplot_build(gg)[["data"]][[1]] %>% ',
        '\t' + '  dplyr::select(sample, theoretical)',
        '\t' + '',
        '\t' + '#----creating residuals tibbles from model',
        '\t' + '# including raw data, residuals vs fitted',
        '\t' + '# .resid=residuals, .fitted=predicted values .estimate=estimate of fixed effect',
        '\t' + 'residuals <- broom::augment(model) %>% ',
        '\t' + '   dplyr::select(%s, .resid, .fitted)'%('participant' + ', sqrt.' + y + '., ' + ', '.join(effects['fixed'].keys())),
        '\t' + '',
        '\t' + '#----merge residuals and qq tibbles',
        '\t' + 'residuals <- merge(x=residuals, y=gg, by.x=".resid", by.y="sample", all.x = TRUE)',
        '\t' + '',
        '\t' + '#----prepare data for export',
        '\t' + '# convert output to tibble',
        '\t' + 'output <- broom.mixed::tidy(model)',
        '\t' + '# convert summary to tibble',
        '\t' + 'summary <- broom.mixed::glance(model)',
        '\t' + '',
        '\t' + '#----return model output, model, residuals, model summary',
        '\t' + 'return(list(output, model, residuals, summary))',
    '}'])
    #load r function
    lmer_r = r(get_lmer)
    #run
    df_r = lmer_r()
    
    #----extract df, model, summary from rpy2
    df_lmer = df_r[0]
    model = df_r[1]
    residuals = df_r[2]
    summary = df_r[3]
    
    #----clean data
    # drop row if it contains intercepts
    drop = ['sd__(Intercept)','	sd__TrialNum','cor__(Intercept).TrialNum','sd__Observation','sd__TrialNum']
    df_lmer = df_lmer[~df_lmer['term'].isin(drop)]
    #rename
    df_lmer = df_lmer.rename(columns={'estimate':'B','std.error':'SE','statistic':'t','p.value':'Pr(>|t|)'})
    # round 
    ## B
    df_lmer['B'] = df_lmer[['B']].apply(lambda x: x.dropna().round(4).astype(str))
    ## SE
    df_lmer['SE'] = df_lmer[['SE']].apply(lambda x: x.dropna().round(4).astype(str))
    ## t
    df_lmer['t'] = df_lmer[['t']].apply(lambda x: x.dropna().round(4).astype(str))
    ## p-value
    df_lmer['Pr(>|t|)'] = df_lmer[['Pr(>|t|)']].apply(lambda x: x.dropna().round(4).astype(str))
    #rename columns
    df_lmer = df_lmer.rename_axis("index", axis="columns")
    # drop column
    df_lmer = df_lmer.drop(['group','effect'], 1)
    
    #----format summary
    summary = summary.rename_axis("index", axis="columns")
    summary = summary.to_html(index=True, index_names=True).replace('<table border="1" class="dataframe">',
    '<table id="table2" class="table '+source+' table-striped table-bordered hover dt-responsive nowrap"\ cellspacing="0" width="100%">')
    
    #---------title, footnote, results
    short_ = config['metadata']['short']
    long_ = config['metadata']['long']
    def_ = config['metadata']['def']
    cite_ = config['metadata']['cite']
    url_ = config['metadata']['url']
    var_ = config['metadata']['var']
    
    #----title, description, and results
    title = '<b>Table 1.</b> Linear Mixed Model Regression for %s (<i>N</i> = %s).'%(long_[y], subject_poisson)
    description = ''.join([
        "<p><b>Linear Mixed Model Fit by REML (Laplace Approximation)</b> \
        [%a]. "%(link(name='lmer', url='https://www.rdocumentation.org/packages/lme4/versions/1.1-21/topics/lmer')),
        "This table summarizes effects on onset error rate with trial number, operating system, and stimulus.</p>",
        "<div class='paragraph'>",
            "<div>The assumptions for the model are:</div>",
            '<ul class="number-list">',
                '<li>%s (%s).</li>'
                %(popover(name=long_['rnd'],title=long_['rnd'],description=def_['rnd']),
                  link(name='Q-Q Plot', url='#qq')),
                '<li>%s (%s).</li>'
                %(popover(name=long_['hv'],title=long_['hv'],description=def_['hv']),
                  link(name='Residual vs Fitted Plot', url='#rf')),
                '<li>%s</li>'
                %(popover(name=long_['io'],title=long_['io'],description=def_['io'])),
            '</ul>',
        '</div>'
        "<div class='paragraph'>",
            "<div>The following post-hoc analysis were run:</div>",
            '<ul class="number-list">',
                '<li>%s.</li>'
                %(link(name='Individual Trend Plot', url='#itl')),
                '<li>%s.</li>'
                %(link(name='Group Trend Plot', url='#gtl')),
                '<li>%s.</li>'
                %(link(name='Q-Q Plot', url='#qq')),
                "<li>%s.</li>"
                %(link(name='Residual vs Fitted Plot', url='#rf')),
            '</ul>',
        '</div>'
    ])
    
    #----results
    '''For statistical analysis, models were conducted with R, version 3.3.2 ([R]), using the <i>lme4</i> package [lmer]. To \
    analyze [y] we fit linear mixed-effects models using the <i>lmer</i> function. For our model, Group (NHI vs. PWA), Ambiguity 
    (ambiguous vs. unambiguous) and Context (DO-bias vs. SC-bias) were entered as fixed effects. Random intercepts and slopes 
    by participants and items were included for all fixed effects, as it was expected that participants and items would be 
    differently affected by the experimental manipulation. '''
    results = [
        '<div class="subtitle">Statistical Analysis</div>',
        "<p>",
        def_['exclude'],
        'We employed linear mixed effects models with random intercepts and slopes using the lmer() function in the <i>lme4</i> R package \
        (%s; %s).'%(\
        popover(name=long_['R'], title=long_['R'], description=cite_['R']),
        popover(name=long_['lmer'], title=long_['lmer'], description=cite_['lmer']))
    ]
    
    #list of fixed effects    
    #"The fixed effects was %s,"%(def_['cesd_group'])
    if effects['fixed'] != None:
        #create qualifier #check if single random effect or multiple random effects
        qualifier = 'was included' if len(effects['fixed']) == 1 else 'were included'
        #start of statement
        results.append("For our model, ")
        #for each effect
        for idx, effect in enumerate(effects['fixed']):
            #if single item
            if len(effects['fixed']) == 1:
                results.append(" %s."%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
            #multiple items
            else:
                #last item
                if idx + 1 == len(effects['fixed']):
                    results.append("and %s."%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
                else:
                    results.append("%s,"%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
        #end  
        results.append("%s as fixed effects."%(qualifier))
    
    #list of random effects
    #stimulus (within subjects; %s, %s),"%(def_['iaps'], def_['pofa']),
    if effects['random'] != None:
        #create qualifier #check if single random effect or multiple random effects
        qualifier = 'was included' if len(effects['random']) == 1 else 'were included'
        #start of statement
        results.append("Random effects for")
        #for each effect
        for idx, effect in enumerate(effects['random']):
            #if single item
            if len(effects['random']) == 1:
                results.append(" %s."%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
            #multiple items
            else:
                #last item
                if idx + 1 == len(effects['random']):
                    results.append("and %s."%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
                else:
                    results.append("%s,"%(popover(name=long_[effect],title=long_[effect],description=def_[effect])))
        #end
        results.append('%s in the model to account for their respective variation in their slopes and intercepts.'%(qualifier))
    
    #outcome variable
    results.append('%s was used as the outcome measure.</p><p>'%(popover(name=long_[y],title=long_[y],description=def_[y])))

    # results
    results.append('For our analysis of %s, our results revealed'%(short_[y]))
    for idx, effect in enumerate(effects['fixed']):
        #get row
        row = df_lmer[df_lmer['term'].str.contains(effect)]
        #get term, B, se, z, p # lmer
        name = effect
        B_ = row['B'].values[0]
        se_ = row['SE'].values[0]
        p_ = row['Pr(>|t|)'].values[0]
        #significance
        if float(p_) <= 0.05:
            results.append('a statistically significant effect of')
        else:
            results.append('no statistically significant effect of')
        #if single item
        if len(effects['fixed']) == 1:
            results.append('%s (<a><i>t</i> = %s, <i>SE</i> = %s, <i>p</i> = %s</a>).'%(short_[name], B_, se_, p_))
        #multiple items
        else:
            #last item
            if idx + 1 == len(effects['fixed']):
                results.append('%s (<a><i>t</i> = %s, <i>SE</i> = %s, <i>p</i> = %s</a>).'%(short_[name], B_, se_, p_))
            else:
                results.append('%s (<a><i>t</i> = %s, <i>SE</i> = %s, <i>p</i> = %s</a>),'%(short_[name], B_, se_, p_))
                
    ## build results            
    results = ' '.join(results)
    
    # combine
    footnote = summary + re.sub(r'\s+', ' ', description + results).strip()
    
    #---------plots
    #build plots
    plots = {}
    
    #---individual trend line
    clip = 250 if y=='diff_dotloc' else 200
    plots['individual'] = {}
    plots['individual']['type'] = 'plot'
    plots['individual']['file'] = '%s_individual'%(y)
    plots['individual']['path'] = path + "/img/" + "%s.png"%(plots['individual']['file'])
    plots['individual']['title'] = "Individual Trend Plot of the Difference Between Expected and True Onset Time \
    for %s (nested by subject:trial, <i>window</i> = 5)."%(long_[y])
    # footnote
    _exc = len(config['metadata']['subjects']['exclude'])
    _pct = (round(len(config['metadata']['subjects']['exclude'])/len(config['metadata']['subjects']['eyetracking']), 4)*100)
    plots['individual']['footnote'] = "Each line represents a participants individual %s across all trials. Participants with \
    Dotloc or Stimulus Onset Error median above 3SD (<i>n</i> = %s, %.1f%%) are drawn with a semi-opaque line. \
    The graph has been clipped at <i>y</i> = %s for displaying purposes."%(short_[y], _exc, _pct, clip)
    #---group trend plot
    plots['group'] = {}
    #---binned
    plots['group']["binned"] = {"bins":33,"ptype":'diff'}
    #---unbinned
    plots['group']["unbinned"] = {"bins":None,"ptype":'diff'}
    #both
    plots['group']['type'] = 'plot'
    plots['group']['file'] = '%s_group'%(y)
    plots['group']['path'] = path + "/img/" + "%s.png"%(plots['group']['file'])
    plots['group']['title'] = "Trend Plot of the Difference Between Expected and True Onset Time \
    for %s (nested by subject:trial)."%(long_[y])
    plots['group']['footnote'] = "Data is either unbinned (c,d) or binned into %s discrete evenly-sized groups (a,b). \
    The model is still fit using the original data. No participants have been excluded for this analysis. \
    The binned graph has been clipped at <i>y</i> = 1000 for displaying purposes."%(plots['group']["binned"]['bins'])
    # run
    #rename
    df = df.rename(columns={'onset>500':'onset_greater'})
    html_plots = plot.onset_diff_plot(config=config, df=df, meta=plots, drop=exclude, y=y, clip=clip)
    
    #---probability (QQ) plot
    file = "%s_qq.png"%(y)
    path_ = path + "/img/" + file
    title_="Q-Q Plot (<a class='cat iaps'>iaps</a>, <a class='cat pofa'>pofa</a>)."
    footnote_ = def_['qq']
    html_plots.append({"title":title_,"file":"%s"%(file),"footnote":footnote_, "anchor":"qq", 'type':'plot'})
    plot.qq_plot(config=config, y=y, residuals=residuals, path=path_)
    
    #---residuals vs fitted plot
    file = "%s_residuals.png"%(y)
    path_ = path + "/img/" + file
    title_="Residuals vs Fitted Plot (<a class='cat iaps'>iaps</a>, <a class='cat pofa'>pofa</a>)."
    footnote_ = def_['rvf']
    html_plots.append({"title":title_,"file":"%s"%(file),"footnote":footnote_, "anchor":"rf", 'type':'plot'})
    plot.residual_plot(config=config, y=y, residuals=residuals, path=path_)

    #---create script
    script = ['<div class="code-container" style="display: none">'+'\n',
                '<div class="button-bar">'+'\n',    
                    '<a href="#" class="btn code hidden" source="copy" role="button">Copy</a>',
                    '<a href="#" class="btn code hidden" source="download" role="button">Download</a>',
                '</div>'+'\n',
                '<pre class="line-numbers">'+'\n',
                    #'<code id="editor" class="lang-r">'+'\n', #tinymce
                    '<code contenteditable="true" class="lang-r" name=%s>'%(var_[y]) +'\n', #prismjs
                       '%s\n'%(get_lmer),
                    '</code>'+'\n',
                '</pre>'+'\n',         
            '</div>'+'\n']
    script = ''.join(script)
    
    #----html
    html = None
    if is_html:
        html_name = '%s_error'%(y)
        path_ = path + "%s.html"%(html_name)
        html = plot.html(config=config, df=df_lmer, raw_data=df, path=path_, source=source, title=title, name=html_name,
                  script=script, plots=html_plots, footnote=footnote, var=var_[y], short=short_[y], long=long_[y])
    #----end
    console('%s finished in %s msec'%(_f,((datetime.datetime.now()-_t0).total_seconds()*1000)), 'blue')
    return model, df_lmer, get_lmer, html

[docs]def logistic(config, y, f, df, me, exclude, csv, path, is_html=True):
    """Run logistic regression model, using rpy2, seaborn and pandas.

    Parameters
    ----------
    y : :obj:`str`
        Response variable.
    f : :obj:`str`
        Formula to use for analysis.
    df : :class:`pandas.core.frame.DataFrame`
        Pandas dataframe of raw data.
    me : :obj:`list` of :obj:`str`
        List of main effects.
    exclude : :obj:`list`
        List of participants to be excluded.
    csv : :obj:`str`
        Name of generated CSV file to run analysis in R.
    path : :obj:`str`
        The directory path to save the generated files.
    is_html : :obj:`bool`
        Whether html should be generated.
        
    Returns
    -------
    model : :class:`rpy2.robjects.methods.RS4`
        Python representation of an R instance of class 'S4'.
    df_logit : :class:`pandas.core.frame.DataFrame`
        Pandas dataframe of model output.
    get_logit : :class:`str`
        R script to run model.
    html : :class:`str
        Html output.
        
    Notes
    -----
    **Resources**
        - https://rpsychologist.com/r-guide-longitudinal-lme-lmer
        - https://stats.idre.ucla.edu/r/dae/mixed-effects-logistic-regression/
        - https://www.statisticssolutions.com/assumptions-of-logistic-regression/
    """
    from rpy2.robjects import pandas2ri, r
    pandas2ri.activate()

    #----for timestamp
    _t0 = datetime.datetime.now()
    _f = debug(message='t', source="timestamp")
    console('model.logistic(%s)'%(y), 'green')
    
    #----metadata
    source = 'logit'
    #get fullname
    full = y.replace("_", " ").title().replace("Cesd", "CESD")
    
    #-----exclude participants
    df = df[~df['participant'].isin(exclude)]
    
    #---------check if paths exist
    for path_ in [path, path+"/csv/", path+"/img/"]:
        if not os.path.exists(path_):
            os.makedirs(path_)
    
    #---------save data for access by R
    df.to_csv(path + "/csv/" + csv, index=None)
    
    #-------get number of remaining subjects (datapoints) used
    subjects_logit = df.drop_duplicates(subset="participant", keep="first").shape[0]
    
    #-------model
    #f = 'factor(cesd_group) ~ dp_bias + gaze_bias + final_gaze_bias + (1|participant)'
    # f = 'glmer(%s, \n\
    #            family=binomial(link="logit"), data=df, nAGQ=0)'%(f)
    f = 'lmer(%s, data=df)'%(f)
    
    #-------logit and confidence intervals to R data.frame
    ##note: confidence intervals are based on the profiled log-likelihood function
    get_logit = '\n'.join([
        '#!/usr/bin/env Rscript3.5.1',
        'rm(list=ls());',
        '#----library',
        '# core',
        'library(tidyverse);',
        'library(broom);',
        '# analysis',
        'library(lme4);',
        '# plot',
        'library(ggplot2);',
        '# estimated marginal means',
        'library(emmeans); library(multcomp);',
        '\t' + '',
        '#----generalized linear mixed model',
        'logit_ <- function(){',
        '\t' + "#----load data",
        '\t' + 'path <- "%s"'%(path + "csv/"),
        '\t' + "df <- read.csv(file=file.path(path, '%s'), header=TRUE)"%(csv),
        '\t' + '',
        '\t' + '#----normalize trial to [0,1] (recommended by Jason)',
        '\t' + 'df$TrialNum <- lapply(df$TrialNum, function(x){((x - 0)/(197 - 0))})',
        '\t' + '',
        '\t' + '#----set type', 
        '\t' + '# set as factor',
        '\t' + "df$os <- factor(df$os)",
        '\t' + "df$trialType <- factor(df$trialType)",
        '\t' + 'df$participant <- factor(df$participant)',
        '\t' + '# set trial as numeric (recommended by Jason)',
        '\t' + 'df$TrialNum <- as.numeric(df$TrialNum)',
        '\t' + '',
        '\t' + '#----run model',
        '\t' + 'model <- %s'%(f),
        '\t' + '',
        '\t' + '#----odds ratio and confidence interval',
        '\t' + 'or_ci <- exp(cbind(OR=fixef(model), confint(model, parm="beta_", method="Wald"))) %>%',
        '\t' + '  cbind(term = rownames(.), .)',
        '\t' + 'rownames(or_ci) <- 1:nrow(or_ci)',
        '\t' + '',
        '\t' + '#----getting x,y coordinates for qqplot',
        '\t' + '# create plot',
        '\t' + 'gg <- ggplot(model) + ',
        '\t' + '  ggplot2::stat_qq(aes(sample = .resid, colour = factor(trialType))) + ',
        '\t' + '  ggplot2::geom_abline(linetype = "dotted") + ',
        '\t' + '  theme_bw()',
        '\t' + '# convert to tibble',
        '\t' + 'gg <- ggplot_build(gg)[["data"]][[1]] %>% ',
        '\t' + '  dplyr::select(sample, theoretical)',
        '\t' + '',
        '\t' + '#----creating residuals tibbles from model',
        '\t' + '# including raw data, residuals vs fitted',
        '\t' + '# .resid=residuals, .fitted=predicted values .estimate=estimate of fixed effect',
        '\t' + 'residuals <- broom::augment(model) %>% ',
        '\t' + '   dplyr::select(participant, trialType, cesd_group, .resid, .fitted)',
        '\t' + '',
        '\t' + '#----merge residuals and qq tibbles',
        '\t' + 'residuals <- merge(x=residuals, y=gg, by.x=".resid", by.y="sample", all.x = TRUE)',
        '\t' + '',
        '\t' + '#----prepare data for export',
        '\t' + '# convert model to tibble',
        '\t' + 'output <- broom::tidy(model)',
        '\t' + '# convert summary to tibble',
        '\t' + 'summary <- broom::glance(model)',
        '\t' + '',
        '\t' + '#----merge output with odds ratio and confidence interval',
        '\t' + 'output <- merge(x=output, y=or_ci, by.x="term", by.y="term", all.x = TRUE, sort = FALSE)',
        '\t' + '',
        '\t' + '#----return model output, model, residuals, model summary',
        '\t' + 'return(list(output, model, residuals, summary))',
    '}'])
    #----run r code
    #load r function
    logit_r = r(get_logit)
    #run
    df_r = logit_r()
    
    #----extract model dataframe, model, and residuals dataframe from rpy2
    df_logit = df_r[0]
    model = df_r[1]
    residuals = df_r[2]
    summary = df_r[3]
    
    #----clean data
    #rename
    df_logit = df_logit.rename(columns={'estimate':'B','std.error':'SE','statistic':'z','p.value':'Pr(>|z|)','97.5 %':'97.5%','2.5 %':'2.5%'})
    # round p-value
    df_logit['Pr(>|z|)'] = df_logit[['Pr(>|z|)']].apply(lambda x: x.dropna().round(4).astype(str))
    #merge ci
    df_logit['95% CI'] = df_logit[['2.5%', '97.5%']].values.tolist()
    # drop column
    df_logit = df_logit.drop(['group','effect','2.5%', '97.5%'], 1)
    # drop row if it contains intercepts
    # drop = ['sd_(Intercept).participant','sd_TrialNum.participant','cor_(Intercept).TrialNum.participant',
    #         'sd__(Intercept)','sd__TrialNum','cor__(Intercept).TrialNum']
    # df_logit = df_logit[~df_logit['term'].isin(drop)]
    
    #rename columns
    df_logit = df_logit.rename_axis("index", axis="columns")
    
    
    #----format summary
    summary = summary.rename_axis("index", axis="columns")
    summary = summary.to_html(index=True, index_names=True).replace('<table border="1" class="dataframe">',
    '<table id="table2" class="table '+source+' table-striped table-bordered hover dt-responsive nowrap"\ cellspacing="0" width="100%">')
    
    #---------title, footnote, results
    short_ = config['metadata']['short']
    long_ = config['metadata']['long']
    ref_ = config['metadata']['ref']
    def_ = config['metadata']['def']
    

    #-------title, footnote, and results
    title = '<b>Table 1.</b> Generalized Linear Mixed Model Regression for %s (N = %s).'%(full, subjects_logit)
    #description
    description = ''.join([
        "<p><b>Generalized Linear Mixed Model Fit by Maximum Likelihood (Adaptive Gauss-Hermite Quadrature)</b> \
        [<a class='anchor', href='https://www.rdocumentation.org/packages/lme4/versions/1.1-21/topics/glmer'>glmer</a>]. ",
        "This table summarizes effects on CESD score with trial number, bias score, and stimulus.</p>",
        "<div class='paragraph'>",
            "<div>The assumptions for the model are:</div>",
            '<ul class="number-list">',
                '<li>%s (<a href="#qq" class="anchor">Q-Q Plot</a>).</li>'%(def_['nd']),
                "<li>%s (<a href='#rf' class='anchor'>Residual vs Fitted Plot</a>).</li>"%(def_['hv']),
                '<li>%s </li>'%(def_['io']),
            '</ul>',
        '</div>'
        "<div class='paragraph'>",
            "<div>The following post-hoc analysis were run:</div>",
            '<ul class="number-list">',
                '<li><a href="#qq" class="anchor">Q-Q Plot</a>.</li>',
                "<li><a href='#rf' class='anchor'>Residual vs Fitted Plot</a>.</li>",
            '</ul>',
        '</div>'
    ])
    #results
    terms = ['bias score','stimulus','trial number']
    results = [
        "The resulting data were analysed by fitting mixed effects logistic regression models in 'R', using the glmer \
        function (%s)."%(def_['lmer']),
        "The dependent variable was a binary measure of CESD score ('Low' (<16) and 'High' (≥16))."
        "The random effects were: trial and particpant.",
        "The fixed effects were: %s score (within subjects),"%(me[0].replace("_", " ")),
        "trial (within subjects; 0-197),",
        "and stimulus (within subjects; %s, %s)."%(def_['iaps'], def_['pofa']),
        "Weights were applied to the model to correct for the validity of bias score per trial."
    ]
    ## main effects
    for idx, effect in enumerate(me):
        #get row
        row = df_logit[df_logit['term'].str.contains(effect)]
        #get term, B, se, z, p
        term = terms[idx]
        or_ = stn(row['OR'].values[0])
        ci_ = '%s, %s'%(stn(row['95% CI'].values[0][0]),stn(row['95% CI'].values[0][1]))
        p_ = stn(row['Pr(>|z|)'].values[0])
        #append
        #if first item
        results.append('From our results, task %s did not predict the magnitude of CESD score &beta;=%s, 95%% CI[%s], p=%s.'%(term,or_,ci_,p_))
            
    results = '<p>' + ' '.join(results) + '</p>'
    
    #combine all
    footnote = re.sub(r'\s+', ' ', summary + description + results).strip()
    
    #----prepare script for html
    script = ['<div class="code-container" style="display: none">'+'\n',
                '<pre class="line-numbers">'+'\n',
                    '<code class="lang-r">'+'\n',
                       '%s\n'%(get_logit),
                    '</code>'+'\n',
                '</pre>'+'\n',
            '</div>'+'\n']
    script = ''.join(script)
    
    #----allows seaborn to be run if matplotlib has already been loaded
    os.environ['KMP_DUPLICATE_LIB_OK']='True'
    
    #----plots
    html_plots = []
    #probability (QQ) plot
    file = "%s_qq.png"%(y)
    path_ = path + "/img/" + file
    title_="Q-Q Plot for CESD Group (<a class='cat iaps'>iaps</a>, <a class='cat pofa'>pofa</a>)."
    footnote_ = def_['qq']
    html_plots.append({"title":title_,"file":"%s"%(file),"footnote":footnote_, "anchor":"qq"})
    plot.qq_plot_r(config=config, y=y, residuals=residuals, path=path_)
    
    #residuals vs fitted plot
    file = "%s_residuals.png"%(y)
    path_ = path + "/img/" + file
    title_="Residuals vs Fitted for CESD Group (<a class='cat iaps'>iaps</a>, <a class='cat pofa'>pofa</a>)."
    footnote_ = def_['rvf']
    html_plots.append({"title":title_,"file":"%s"%(file),"footnote":footnote_, "anchor":"rf"})
    plot.residual_plot_r(config=config, y=y, residuals=residuals, path=path_)
        
    #----save model and plot
    #save model and plot
    ##create html
    html = None
    if is_html:
        path_ = path + '/%s.html'%(y)
        html = plot.html(config=config, df=df_logit, raw_data=df, path=path_, source=source, plots=html_plots, name=y, 
                  title=title, footnote=footnote, script=script)        
    #----end
    console('%s finished in %s msec'%(_f,((datetime.datetime.now()-_t0).total_seconds()*1000)), 'blue')
    return model, df_logit, get_logit, html