From 70f7cb8891c7a4eee3fb096198b42b74460f28bf Mon Sep 17 00:00:00 2001
From: aschroed
Date: Wed, 20 Apr 2022 13:24:41 -0400
Subject: [PATCH 01/14] adding gsheet support

---
 wranglertools/import_data.py | 190 +++++++++++++++++++++++------------
 1 file changed, 127 insertions(+), 63 deletions(-)

diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py
index c0971119..e41fc777 100755
--- a/wranglertools/import_data.py
+++ b/wranglertools/import_data.py
@@ -9,6 +9,7 @@
     create_common_arg_parser, _remove_all_from_types)
 from dcicutils import ff_utils
 import openpyxl
+import gspread
 import datetime
 import sys
 import mimetypes
@@ -137,6 +138,23 @@ def getArgs():  # pragma: no cover
 ]


+ALLOWED_MIMES = (
+    'application/pdf',
+    'application/zip',
+    'text/plain',
+    'text/tab-separated-values',
+    'text/html',
+    'application/msword',
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    'application/vnd.ms-excel',
+    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+    'image/png',
+    'image/jpeg',
+    'image/gif',
+    'image/tiff',
+)
+
+
 def md5(path):
     md5sum = hashlib.md5()
     with open(path, 'rb') as f:
@@ -152,26 +170,27 @@ class WebFetchException(Exception):
     pass


+def mime_allowed(path, ok_mimes):
+    filename = pp.PurePath(path).name
+    guessed_mime = mimetypes.guess_type(path)[0]
+    detected_mime = magic.from_file(path, mime=True)
+    if guessed_mime not in ok_mimes:
+        print("Unallowed file type for %s" % filename)
+        return False
+    # NOTE: this whole guesssing and detecting bit falls apart for zip files which seems a bit dodgy
+    # some .zip files are detected as generic application/octet-stream but don't see a good way to verify
+    # basically relying on extension with a little verification by magic for most file types
+    if detected_mime != guessed_mime and guessed_mime != 'application/zip':
+        print('Wrong extension for %s: %s' % (detected_mime, filename))
+        return False
+    return guessed_mime
+
+
 def attachment(path):
     """Create an attachment upload object from a filename and embed the attachment as a data url.
NOTE: a url or ftp can be used but path must end in filename with extension that will match the magic detected MIME type of that file and be one of the allowed mime types """ - ALLOWED_MIMES = ( - 'application/pdf', - 'application/zip', - 'text/plain', - 'text/tab-separated-values', - 'text/html', - 'application/msword', - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'application/vnd.ms-excel', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - 'image/png', - 'image/jpeg', - 'image/gif', - 'image/tiff', - ) ftp_attach = False if not pp.Path(path).is_file(): # if the path does not exist, check if it works as a URL @@ -207,15 +226,9 @@ def attachment(path): attach = {} filename = pp.PurePath(path).name - guessed_mime = mimetypes.guess_type(path)[0] - detected_mime = magic.from_file(path, mime=True) - # NOTE: this whole guesssing and detecting bit falls apart for zip files which seems a bit dodgy - # some .zip files are detected as generic application/octet-stream but don't see a good way to verify - # basically relying on extension with a little verification by magic for most file types - if guessed_mime not in ALLOWED_MIMES: - raise ValueError("Unallowed file type for %s" % filename) - if detected_mime != guessed_mime and guessed_mime != 'application/zip': - raise ValueError('Wrong extension for %s: %s' % (detected_mime, filename)) + guessed_mime = mime_allowed(path, ALLOWED_MIMES) + if not guessed_mime: + sys.exit(1) with open(path, 'rb') as stream: attach = { @@ -234,25 +247,53 @@ def digest_xlsx(filename): return book, sheets -def reader(workbook, sheetname=None): - """Read named sheet or first and only sheet from xlsx file.""" - if sheetname is None: - sheet = workbook.worksheets[0] - else: - try: - sheet = workbook[sheetname] - except Exception as e: - print(e) - print(sheetname) - print("ERROR: Can not find the collection sheet in excel file (openpyxl error)") - return +def open_gsheets(gsid): + gc = gspread.service_account(filename='credentials.json') + wkbk = gc.open_by_key(gsid) + sheets = [sh.title for sh in wkbk.worksheets()] + return wkbk, sheets + + +def get_workbook(inputname, booktype): + if booktype == 'excel': + return digest_xlsx(inputname) + elif booktype == 'gsheet': + return open_gsheets(inputname) + + +def reader(workbook, sheetname=None, booktype=None): + """Read named sheet or first and only sheet from xlsx or google sheets file. + Assume excel by default - will choke if no booktype and not excel""" + sheet = None + if not booktype or booktype == 'excel': + if sheetname is None: + sheet = workbook.worksheets[0] + else: + try: + sheet = workbook[sheetname] + except Exception as e: + print(e) + print(sheetname) + print("ERROR: Can not find the collection sheet in excel file (openpyxl error)") + return + elif booktype == 'gsheet': + if sheetname is None: + sheet = workbook.get_worksheet(0) + else: + try: + sheet = workbook.worksheet(sheetname) + except Exception as e: + print(e) + print(sheetname) + print("ERROR: Can not find the collection sheet in excel file (gspread error)") + return # Generator that gets rows from excel sheet # NB we have a lot of empty no formatting rows added (can we get rid of that) # or do we need to be careful to check for the first totally emptyvalue row? 
- return row_generator(sheet) + return row_generator(sheet, booktype) -def row_generator(sheet): +def row_generator(sheet, booktype=None): """Generator that gets rows from excel sheet Note that this currently checks to see if a row is empty and if so stops This is needed as plain text formatting of cells is recognized as data @@ -260,12 +301,18 @@ def row_generator(sheet): excel transforms - maybe this is no longer needed and therefore this function can be simplified - AJS 2022-04-11 """ - for row in sheet.rows: - vals = [cell_value(cell) for cell in row] - if not any([v for v in vals]): - return - else: - yield vals + if not booktype or booktype == 'excel': + for row in sheet.rows: + vals = [cell_value(cell) for cell in row] + if not any([v for v in vals]): + return + else: + yield vals + else: + # no formatting here assuming all are strings + all_vals = sheet.get_values() + for row in all_vals: + yield row def cell_value(cell): @@ -1067,7 +1114,7 @@ def check_file_pairing(fastq_row): return _pairing_consistency_check(files, errors) -def workbook_reader(workbook, sheet, update, connection, patchall, aliases_by_type, +def workbook_reader(workbook, booktype, sheet, update, connection, patchall, aliases_by_type, dict_patch_loadxl, dict_replicates, dict_exp_sets, novalidate, attach_fields): """takes an openpyxl workbook object and posts, patches or does a dry run on the data depending on the options passed in. @@ -1077,7 +1124,7 @@ def workbook_reader(workbook, sheet, update, connection, patchall, aliases_by_ty all_aliases = [k for k in aliases_by_type] # dict for acumulating cycle patch data patch_loadxl = [] - row = reader(workbook, sheetname=sheet) + row = reader(workbook, sheetname=sheet, booktype=booktype) skip_dryrun = False if sheet == "ExperimentMic_Path": skip_dryrun = True @@ -1347,9 +1394,9 @@ def build_tibanna_json(keys, types, values, connection): return template -def user_workflow_reader(workbook, sheet, connection): +def user_workflow_reader(workbook, booktype, sheet, connection): """takes the user workflow runsheet and ony post it to fourfront endpoint.""" - row = reader(workbook, sheetname=sheet) + row = reader(workbook, sheetname=sheet, booktype=booktype) keys = next(row) # grab the first row of headers types = next(row) # grab second row with type info # remove title column @@ -1493,14 +1540,30 @@ def _verify_and_return_item(item, connection): return res -def cabin_cross_check(connection, patchall, update, infile, remote, lab=None, award=None): - """Set of check for connection, file, dryrun, and prompt.""" - print("Running on: {server}".format(server=connection.key['server'])) - # check input file (xls) - if not pp.Path(infile).is_file(): - print(f"File {infile} not found!") +def check_and_return_input_type(inputname): + if pp.Path(inputname).is_file(): + xlsx_mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' + # specific check for xlsx + if not mime_allowed(inputname, xlsx_mime): + print(f"ERROR: File {infile} not recognized as excel file") + sys.exit(1) + return inputname, 'excel' + elif inputname.startswith('http'): + # assume a url to google sheet and look for google? 
+ if 'google' not in inputname: + print("ERROR: URL provided does not appear to be google sheet url") + sys.exit(1) + # parse out the bookId + inputname = re.search("/d/([A-Za-z0-9_-]+)/*", inputname).group(1) + if not re.match("^[A-Za-z0-9_-]+$", inputname): + print("ERROR: invalid format of the google sheet ID in input - {}".format(inputname)) sys.exit(1) + return inputname, 'gsheet' + +def cabin_cross_check(connection, patchall, update, remote, lab=None, award=None): + """Set of check for connection, input, dryrun, and prompt.""" + print("Running on: {server}".format(server=connection.key['server'])) # check for multi labs and awards and reset connection appropriately # if lab and/or award options used modify connection accordingly and check for conflict later if lab or award: @@ -1590,7 +1653,7 @@ def get_collections(profiles): return supported_collections -def get_all_aliases(workbook, sheets): +def get_all_aliases(workbook, sheets, booktype): """Extracts all aliases existing in the workbook to later check object connections Checks for same aliases that are used for different items and gives warning.""" aliases_by_type = {} @@ -1598,7 +1661,7 @@ def get_all_aliases(workbook, sheets): if sheet == 'ExperimentMic_Path': continue alias_col = "" - rows = reader(workbook, sheetname=sheet) + rows = reader(workbook, sheetname=sheet, booktype=booktype) keys = next(rows) # grab the first row of headers try: alias_col = keys.index("aliases") @@ -1629,10 +1692,11 @@ def main(): # pragma: no cover sys.exit(1) # establish connection and run checks connection = FDN_Connection(key) - cabin_cross_check(connection, args.patchall, args.update, args.infile, + cabin_cross_check(connection, args.patchall, args.update, args.remote, args.lab, args.award) - # support for xlsx only - adjust if allowing different formats - workbook, sheetnames = digest_xlsx(args.infile) + # support for xlsx and google sheet url or sheets id + inputname, booktype = check_and_return_input_type(args.infile) + workbook, sheetnames = get_workbook(inputname, booktype) # This is not in our documentation, but if single sheet is used, file name can be the collection if args.type and 'all' not in args.type: @@ -1646,7 +1710,7 @@ def main(): # pragma: no cover # we want to read through names in proper upload order sorted_names = order_sorter(names) # get all aliases from all sheets for dryrun object connections tests - aliases_by_type = get_all_aliases(workbook, sorted_names) + aliases_by_type = get_all_aliases(workbook, sorted_names, booktype) # all_aliases = list(aliases_by_type.keys()) # dictionaries that accumulate information during submission dict_loadxl = {} @@ -1656,14 +1720,14 @@ def main(): # pragma: no cover # accumulate = {dict_loadxl: {}, dict_replicates: {}, dict_exp_sets: {}} for n in sorted_names: if n.lower() in supported_collections: - workbook_reader(workbook, n, args.update, connection, args.patchall, aliases_by_type, + workbook_reader(workbook, booktype, n, args.update, connection, args.patchall, aliases_by_type, dict_loadxl, dict_replicates, dict_exp_sets, args.novalidate, attachment_fields) elif n.lower() == "experimentmic_path": - workbook_reader(workbook, "ExperimentMic_Path", args.update, connection, args.patchall, aliases_by_type, + workbook_reader(workbook, booktype, "ExperimentMic_Path", args.update, connection, args.patchall, aliases_by_type, dict_loadxl, dict_replicates, dict_exp_sets, args.novalidate, attachment_fields) elif n.lower().startswith('user_workflow'): if args.update: - 
user_workflow_reader(workbook, n, connection) + user_workflow_reader(workbook, booktype, n, connection) else: print('user workflow sheets will only be processed with the --update argument') else: From 849a4059a7740405c3a659f9aa7c542761c70f7d Mon Sep 17 00:00:00 2001 From: aschroed Date: Tue, 3 May 2022 09:17:51 -0400 Subject: [PATCH 02/14] steps to auth for gsheet --- wranglertools/import_data.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py index e41fc777..873bc27b 100755 --- a/wranglertools/import_data.py +++ b/wranglertools/import_data.py @@ -28,9 +28,11 @@ from urllib import request as urllib2 from contextlib import closing +# for gsheet authentication + EPILOG = ''' -This script takes in an Excel file with the data +This script takes in a spreadsheet workbook file with the data This is a dryrun-default script, run with --update, --patchall or both (--update --patchall) to actually submit data to the portal @@ -154,6 +156,8 @@ def getArgs(): # pragma: no cover 'image/tiff', ) +G_API_CLIENT_ID = '258037973854-vgk9qvfsnps2gaca354bmrk80mmtf3do.apps.googleusercontent.com' + def md5(path): md5sum = hashlib.md5() @@ -1561,7 +1565,7 @@ def check_and_return_input_type(inputname): return inputname, 'gsheet' -def cabin_cross_check(connection, patchall, update, remote, lab=None, award=None): +def cabin_cross_check(connection, patchall, update, remote, booktype='excel', lab=None, award=None): """Set of check for connection, input, dryrun, and prompt.""" print("Running on: {server}".format(server=connection.key['server'])) # check for multi labs and awards and reset connection appropriately @@ -1603,6 +1607,10 @@ def cabin_cross_check(connection, patchall, update, remote, lab=None, award=None print("Award {} not associated with lab {} - exiting!".format(submit_award, submit_lab)) sys.exit(1) + # if workbook is google sheet then do auth here and return credentials + if booktype == 'gsheet': + creds = do_authentication() + print("Submitting User: {}".format(connection.email)) missing = [] if connection.lab is None: @@ -1692,12 +1700,13 @@ def main(): # pragma: no cover sys.exit(1) # establish connection and run checks connection = FDN_Connection(key) - cabin_cross_check(connection, args.patchall, args.update, - args.remote, args.lab, args.award) # support for xlsx and google sheet url or sheets id inputname, booktype = check_and_return_input_type(args.infile) workbook, sheetnames = get_workbook(inputname, booktype) + cabin_cross_check(connection, args.patchall, args.update, + args.remote, booktype, args.lab, args.award) + # This is not in our documentation, but if single sheet is used, file name can be the collection if args.type and 'all' not in args.type: names = args.type From 29f4b2d50db73a5bb05f42ac712b10f42c835e61 Mon Sep 17 00:00:00 2001 From: aschroed Date: Wed, 26 Oct 2022 13:09:11 -0400 Subject: [PATCH 03/14] finally figured out how google auth stuff can work --- .gitignore | 1 + wranglertools/import_data.py | 177 +++++++++++++++++++---------------- 2 files changed, 96 insertions(+), 82 deletions(-) diff --git a/.gitignore b/.gitignore index 1b5be302..34b23060 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ Custom_scripts/ Data_Files/MicroscopyCalibration/Files/ .pytest_cache/ .python-version +wranglertools/.config/gspread/authorized_user.json \ No newline at end of file diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py index ba47e652..6d65f90d 100755 --- 
a/wranglertools/import_data.py +++ b/wranglertools/import_data.py @@ -2,71 +2,36 @@ # -*- coding: latin-1 -*- """See the epilog for detailed information.""" import argparse -import pathlib as pp -import hashlib -from wranglertools.get_field_info import ( - sheet_order, FDN_Key, FDN_Connection, - create_common_arg_parser, _remove_all_from_types) -from dcicutils import ff_utils -import openpyxl -import gspread -import warnings # to suppress openpxl warning about headers -from openpyxl.utils.exceptions import InvalidFileException +import ast import datetime -import sys +import hashlib import mimetypes -import requests -from base64 import b64encode -import magic # install me with 'pip install python-magic' -# https://github.com/ahupp/python-magic -# this is the site for python-magic in case we need it -import ast import os -import time -import subprocess -import shutil +import pathlib as pp import re -from collections import OrderedDict, Counter -from urllib import request as urllib2 +import shutil +import subprocess +import sys +import time +import warnings # to suppress openpyxl warnings +from base64 import b64encode +from collections import Counter, OrderedDict from contextlib import closing +from urllib import request as urllib2 -# for gsheet authentication - - -EPILOG = ''' -This script takes in a spreadsheet workbook file with the data -This is a dryrun-default script, run with --update, --patchall or both (--update --patchall) -to actually submit data to the portal - -By DEFAULT: -If there is a uuid, @id, accession, or previously submitted alias in the document: -Use '--patchall' if you want to patch ALL objects in your document and ignore that message - -If you want to upload new items(no existing object identifiers are found), -in the document you need to use '--update' for POSTing to occur - -Defining Object type: - Each "sheet" of the excel file is named after the object type you are uploading, - with the format used on http://data.4dnucleome.org//profiles/ -Ex: ExperimentHiC, Biosample, Document, BioFeature - -If you only want to submit a subset of sheets in a workbook use the --type option with the -sheet name Ex: %(prog)s mydata.xsls --type ExperimentHiC - -The name of each sheet should be the names of the object type. -Ex: Award, Lab, BioFeature, etc. - -The first row of the sheets should be the field names -Ex: aliases, experiment_type, etc. - -To upload objects with attachments, use the column titled "attachment" -containing the full path to the file you wish to attach - -To delete a field, use the keyword "*delete*" as the value. 
- -For more details: -please see README.rst -''' +import gspread +# https://github.com/ahupp/python-magic +# this is the site for python-magic in case we need it +import magic # install me with 'pip install python-magic' +import openpyxl +import requests +from dcicutils import ff_utils +from gspread.exceptions import GSpreadException +from openpyxl.utils.exceptions import InvalidFileException +from wranglertools.get_field_info import (FDN_Connection, FDN_Key, + _remove_all_from_types, + create_common_arg_parser, + sheet_order) def getArgs(): # pragma: no cover @@ -109,6 +74,42 @@ def getArgs(): # pragma: no cover return args +EPILOG = ''' +This script takes in a spreadsheet workbook file with the data +This is a dryrun-default script, run with --update, --patchall or both (--update --patchall) +to actually submit data to the portal + +By DEFAULT: +If there is a uuid, @id, accession, or previously submitted alias in the document: +Use '--patchall' if you want to patch ALL objects in your document and ignore that message + +If you want to upload new items(no existing object identifiers are found), +in the document you need to use '--update' for POSTing to occur + +Defining Object type: + Each "sheet" of the excel file is named after the object type you are uploading, + with the format used on http://data.4dnucleome.org//profiles/ +Ex: ExperimentHiC, Biosample, Document, BioFeature + +If you only want to submit a subset of sheets in a workbook use the --type option with the +sheet name Ex: %(prog)s mydata.xlxs --type ExperimentHiC + +The name of each sheet should be the names of the object type. +Ex: Award, Lab, BioFeature, etc. + +The first row of the sheets should be the field names +Ex: aliases, experiment_type, etc. + +To upload objects with attachments, use the column titled "attachment" +containing the full path to the file you wish to attach + +To delete a field, use the keyword "*delete*" as the value. + +For more details: +please see README.rst +''' + + # list of [sheet, [fields]] that need to be patched as a second step # should be in sync with loadxl.py in fourfront list_of_loadxl_fields = [ @@ -159,7 +160,8 @@ def getArgs(): # pragma: no cover ) G_API_CLIENT_ID = '258037973854-vgk9qvfsnps2gaca354bmrk80mmtf3do.apps.googleusercontent.com' - +# If modifying these scopes, delete the file token.json. 
+SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly'] def md5(path_string): path = pp.Path(path_string).expanduser() @@ -177,6 +179,18 @@ class WebFetchException(Exception): pass +def authenticate(): + gsauth = None + import pdb; pdb.set_trace() + creddir = pp.Path(__file__).parent.joinpath('.config', 'gspread') + gsauth = gspread.oauth( + credentials_filename=creddir.joinpath('credentials.json'), + authorized_user_filename=creddir.joinpath('authorized_user.json'), + scopes=SCOPES + ) + return gsauth + + def mime_allowed(path, ok_mimes): filename = pp.PurePath(path).name guessed_mime = mimetypes.guess_type(path)[0] @@ -271,18 +285,20 @@ def digest_xlsx(filename): return book, sheets -def open_gsheets(gsid): - gc = gspread.service_account(filename='credentials.json') - wkbk = gc.open_by_key(gsid) +def open_gsheets(gsid, gauth): + import pdb; pdb.set_trace() + wkbk = gauth.open_by_key(gsid) sheets = [sh.title for sh in wkbk.worksheets()] return wkbk, sheets -def get_workbook(inputname, booktype): +def get_workbook(inputname, booktype, gauth=None): if booktype == 'excel': return digest_xlsx(inputname) elif booktype == 'gsheet': - return open_gsheets(inputname) + if not gauth: + raise GSpreadException("Google authentication problem") + return open_gsheets(inputname, gauth) def reader(workbook, sheetname=None, booktype=None): @@ -1587,7 +1603,7 @@ def check_and_return_input_type(inputname): xlsx_mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' # specific check for xlsx if not mime_allowed(inputname, xlsx_mime): - print(f"ERROR: File {infile} not recognized as excel file") + print(f"ERROR: File {inputname} not recognized as excel file") sys.exit(1) return inputname, 'excel' elif inputname.startswith('http'): @@ -1602,13 +1618,10 @@ def check_and_return_input_type(inputname): return inputname, 'gsheet' -def cabin_cross_check(connection, patchall, update, remote, booktype='excel', lab=None, award=None): + +def cabin_cross_check(connection, patchall, update, remote, lab=None, award=None): """Set of check for connection, input, dryrun, and prompt.""" print("Running on: {server}".format(server=connection.key['server'])) - # check input file (xlsx) - if not pp.Path(infile).is_file(): - print(f"File {infile} not found!") - sys.exit(1) # check for multi labs and awards and reset connection appropriately # if lab and/or award options used modify connection accordingly and check for conflict later if lab or award: @@ -1648,10 +1661,6 @@ def cabin_cross_check(connection, patchall, update, remote, booktype='excel', la print("Award {} not associated with lab {} - exiting!".format(submit_award, submit_lab)) sys.exit(1) - # if workbook is google sheet then do auth here and return credentials - if booktype == 'gsheet': - creds = do_authentication() - print("Submitting User: {}".format(connection.email)) missing = [] if connection.lab is None: @@ -1674,10 +1683,7 @@ def cabin_cross_check(connection, patchall, update, remote, booktype='excel', la print("############## DRY-RUN MODE ################\n") else: if not remote: - try: - response = raw_input("Do you want to continue with these credentials? (Y/N): ") or "N" - except NameError: - response = input("Do you want to continue with these credentials? (Y/N): ") or "N" + response = input("Do you want to continue with these credentials? 
(Y/N): ") or "N" if response.lower() not in ["y", "yes"]: sys.exit(1) @@ -1743,10 +1749,17 @@ def main(): # pragma: no cover connection = FDN_Connection(key) # support for xlsx and google sheet url or sheets id inputname, booktype = check_and_return_input_type(args.infile) - workbook, sheetnames = get_workbook(inputname, booktype) - cabin_cross_check(connection, args.patchall, args.update, - args.remote, booktype, args.lab, args.award) + cabin_cross_check(connection, args.patchall, args.update, args.remote, args.lab, args.award) + + # need to google authenticate to allow gsheet to be read + gauth=None + if booktype == 'gsheet': + gauth = authenticate() + + workbook, sheetnames = get_workbook(inputname, booktype, gauth) + + # This is not in our documentation, but if single sheet is used, file name can be the collection if args.type and 'all' not in args.type: From 7a65c9a49107b292088638eddad11d3806db28d7 Mon Sep 17 00:00:00 2001 From: aschroed Date: Wed, 26 Oct 2022 15:27:04 -0400 Subject: [PATCH 04/14] fixing tests; considering removing ftp support --- tests/test_import_data.py | 97 +++++++++++++++++++----------------- wranglertools/import_data.py | 8 +-- 2 files changed, 53 insertions(+), 52 deletions(-) diff --git a/tests/test_import_data.py b/tests/test_import_data.py index c024661d..0ac7cdeb 100644 --- a/tests/test_import_data.py +++ b/tests/test_import_data.py @@ -1,21 +1,23 @@ -import wranglertools.import_data as imp -import pytest import pathlib as pp + +import pytest +import wranglertools.import_data as imp + # test data is in conftest.py -# @pytest.mark.file_operation -@pytest.mark.ftp -def test_attachment_from_ftp(): - attach = imp.attachment("ftp://speedtest.tele2.net/1KB.zip") - assert attach +# # @pytest.mark.file_operation +# @pytest.mark.ftp +# def test_attachment_from_ftp(): +# attach = imp.attachment("ftp://speedtest.tele2.net/1KB.zip") +# assert attach -@pytest.mark.ftp -def test_attachment_ftp_to_nowhere(): - with pytest.raises(Exception) as e: - imp.attachment("ftp://on/a/road/to/nowhere/blah.txt") - assert "urlopen error" in str(e.value) +# @pytest.mark.ftp +# def test_attachment_ftp_to_nowhere(): +# with pytest.raises(Exception) as e: +# imp.attachment("ftp://on/a/road/to/nowhere/blah.txt") +# assert "urlopen error" in str(e.value) def convert_to_path_with_tilde(string_path): @@ -76,7 +78,7 @@ def test_attachment_wrong_path(): @pytest.mark.webtest def test_attachment_url(): - attach = imp.attachment("http://example.com/index.html") + attach = imp.attachment("https://example.com/index.html") assert attach['download'] == 'index.html' assert attach['type'] == 'text/html' assert attach['href'].startswith('data:text/html;base64') @@ -124,7 +126,8 @@ def test_reader_with_sheetname(vendor_raw_xls_fields, workbooks): @pytest.mark.file_operation def test_reader_wrong_sheetname(capsys): - msg = "string indices must be integers\nEnzyme\nERROR: Can not find the collection sheet in excel file (openpyxl error)\n" + msg = "string indices must be integers\nEnzyme\nERROR: Can not find the collection sheet in excel file" \ + " (openpyxl error)\n" sheet = 'Vendor' sheetkey = "{}.xlsx".format(sheet) readxls = imp.reader(sheetkey, 'Enzyme') @@ -282,7 +285,8 @@ def test_error_report(connection_mock): {"name": "age", "description": "'at' is not of type 'number'", "location": "body"}, {"name": "sex", - "description": "'green' is not one of ['male', 'female', 'unknown', 'mixed']", "location": "body"}], + "description": "'green' is not one of ['male', 'female', 'unknown', 'mixed']", + 
"location": "body"}], "code": 422, "@type": ["ValidationFailure", "Error"], "description": "Failed validation"} @@ -340,7 +344,7 @@ def test_digest_xlsx(workbooks): assert book[sheet].max_column == workbook[sheet].max_column -def test_workbooks_reader_no_update_no_patchall_new_doc_with_attachment(capsys, mocker, connection_mock, workbooks): +def test_workbooks_reader_no_update_no_patchall_new_doc_with_attachment(mocker, connection_mock, workbooks): # test new item submission without patchall update tags and check the return message test_insert = 'Document_insert.xlsx' dict_load = {} @@ -352,7 +356,7 @@ def test_workbooks_reader_no_update_no_patchall_new_doc_with_attachment(capsys, mocker.patch('wranglertools.import_data.remove_deleted', return_value={}) # mocking the test post line mocker.patch('dcicutils.ff_utils.post_metadata', return_value={'status': 'success'}) - imp.workbook_reader(workbooks.get(test_insert), 'Document', False, connection_mock, False, + imp.workbook_reader(workbooks.get(test_insert), 'excel', 'Document', False, connection_mock, False, all_aliases, dict_load, dict_rep, dict_set, True, ['attachment']) args = imp.remove_deleted.call_args attach = args[0][0]['attachment'] @@ -376,7 +380,8 @@ def test_workbook_reader_no_update_no_patchall_existing_item(capsys, mocker, con mocker.patch('wranglertools.import_data.get_existing', return_value=existing_vendor) mocker.patch('wranglertools.import_data.ff_utils.patch_metadata', return_value={'status': 'success', '@graph': [{'uuid': 'uid1', '@id': '/vendor/test'}]}) - imp.workbook_reader(workbooks.get(test_insert), 'Vendor', False, connection_mock, False, {}, dict_load, dict_rep, dict_set, True, []) + imp.workbook_reader(workbooks.get(test_insert), 'excel', 'Vendor', False, connection_mock, False, {}, dict_load, + dict_rep, dict_set, True, []) out = capsys.readouterr() args = imp.get_existing.call_args assert args[0][0] == post_json @@ -396,12 +401,13 @@ def test_workbook_reader_post_ftp_file_upload(capsys, mocker, connection_mock, w # mock upload file and skip mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) # mock the ftp copy - this should get it's own tests - mocker.patch('wranglertools.import_data.ftp_copy', return_value=(True, {'md5sum': '0f343b0931126a20f133d67c2b018a3b'}, '1KB.zip')) + mocker.patch('wranglertools.import_data.ftp_copy', + return_value=(True, {'md5sum': '0f343b0931126a20f133d67c2b018a3b'}, '1KB.zip')) # mock file deletion mocker.patch('wranglertools.import_data.pp.Path.unlink') # mock posting new items mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.workbook_reader(workbooks.get(test_insert), 'FileCalibration', True, connection_mock, False, + imp.workbook_reader(workbooks.get(test_insert), 'excel', 'FileCalibration', True, connection_mock, False, all_aliases, dict_load, dict_rep, dict_set, True, []) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] @@ -431,7 +437,7 @@ def test_workbook_reader_post_ftp_file_upload_no_md5(capsys, mocker, connection_ mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) # mock posting new items mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.workbook_reader(workbooks.get(test_insert), 'FileCalibration', True, connection_mock, False, + imp.workbook_reader(workbooks.get(test_insert), 'excel', 'FileCalibration', True, connection_mock, False, all_aliases, dict_load, dict_rep, dict_set, True, []) out = capsys.readouterr()[0] outlist = [i.strip() for i in 
out.split('\n') if i.strip()] @@ -460,11 +466,11 @@ def test_workbook_reader_update_new_file_fastq_post_and_file_upload(capsys, mock mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) # mock posting new items mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.workbook_reader(workbooks.get(test_insert), 'FileFastq', True, connection_mock, False, + imp.workbook_reader(workbooks.get(test_insert), 'excel', 'FileFastq', True, connection_mock, False, all_aliases, dict_load, dict_rep, dict_set, True, []) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] - outlist = [i.strip() for i in out.split('\n') if i is not ""] + outlist = [i.strip() for i in out.split('\n') if i] post_json_arg = args[0][0] assert post_json_arg['md5sum'] == '8f8cc612e5b2d25c52b1d29017e38f2b' assert message0 == outlist[0] @@ -500,7 +506,7 @@ def test_workbook_reader_patch_file_meta_and_file_upload(capsys, mocker, connect mocker.patch('dcicutils.ff_utils.patch_metadata', return_value=e) # mock get upload creds mocker.patch('wranglertools.import_data.get_upload_creds', return_value="new_creds") - imp.workbook_reader(workbooks.get(test_insert), 'FileFastq', False, connection_mock, True, + imp.workbook_reader(workbooks.get(test_insert), 'excel', 'FileFastq', False, connection_mock, True, all_aliases, dict_load, dict_rep, dict_set, True, []) # check for md5sum args = imp.ff_utils.patch_metadata.call_args @@ -512,7 +518,7 @@ def test_workbook_reader_patch_file_meta_and_file_upload(capsys, mocker, connect assert updated_post['@graph'][0]['upload_credentials'] == 'new_creds' # check for output message out = capsys.readouterr()[0] - outlist = [i.strip() for i in out.split('\n') if i is not ""] + outlist = [i.strip() for i in out.split('\n') if i] # is not ""] assert message0 == outlist[0] assert message1 == outlist[1] @@ -533,7 +539,7 @@ def test_workbook_reader_update_new_filefastq_meta_post(capsys, mocker, connecti mocker.patch('wranglertools.import_data.get_existing', return_value={}) # mock posting new items mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.workbook_reader(workbooks.get(test_insert), 'FileFastq', True, connection_mock, False, + imp.workbook_reader(workbooks.get(test_insert), 'excel', 'FileFastq', True, connection_mock, False, all_aliases, dict_load, dict_rep, dict_set, True, []) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] @@ -557,7 +563,7 @@ def test_workbook_reader_update_new_replicate_set_post(capsys, mocker, connectio mocker.patch('wranglertools.import_data.get_existing', return_value={}) # mock upload file and skip mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.workbook_reader(workbooks.get(test_insert), 'ExperimentSetReplicate', True, connection_mock, + imp.workbook_reader(workbooks.get(test_insert), 'excel', 'ExperimentSetReplicate', True, connection_mock, False, all_aliases, dict_load, dict_rep, dict_set, True, []) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] @@ -579,7 +585,7 @@ def test_workbook_reader_update_new_experiment_set_post(capsys, mocker, connecti mocker.patch('wranglertools.import_data.get_existing', return_value={}) # mock upload file and skip mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.workbook_reader(workbooks.get(test_insert), 'ExperimentSet', True, connection_mock, False, + imp.workbook_reader(workbooks.get(test_insert), 'excel', 'ExperimentSet', True, connection_mock, False, all_aliases, 
dict_load, dict_rep, dict_set, True, []) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] @@ -631,18 +637,17 @@ def test_user_workflow_reader_wfr_post(capsys, mocker, connection_mock, workbook # mock formating files mocker.patch('wranglertools.import_data.format_file', side_effect=[ {'bucket_name': 'elasticbeanstalk-fourfront-webdev-files', 'workflow_argument_name': 'chromsize', - 'object_key': '4DNFI823LSII.chrom.sizes','uuid': '4a6d10ee-2edb-4402-a98f-0edb1d58f5e9'}, + 'object_key': '4DNFI823LSII.chrom.sizes', 'uuid': '4a6d10ee-2edb-4402-a98f-0edb1d58f5e9'}, {'bucket_name': 'elasticbeanstalk-fourfront-webdev-wfoutput', 'workflow_argument_name': 'input_bams', - 'object_key': ['4DNFIYI7YMVU.bam', '4DNFIPMZQNF5.bam'], 'uuid': [ - '11c12207-6684-4346-9038-e7819dfde4e5', '4d55623a-1698-44c2-b111-1aa1379edc57' - ]}, + 'object_key': ['4DNFIYI7YMVU.bam', '4DNFIPMZQNF5.bam'], + 'uuid': ['11c12207-6684-4346-9038-e7819dfde4e5', '4d55623a-1698-44c2-b111-1aa1379edc57']}, {'bucket_name': 'elasticbeanstalk-fourfront-webdev-wfoutput', 'workflow_argument_name': 'annotated_bam', 'object_key': '4DNFIVQPE4WT.bam', 'uuid': 'b0aaf32c-58de-475a-a222-3f16d3cb68f4'}, {'bucket_name': 'elasticbeanstalk-fourfront-webdev-wfoutput', 'workflow_argument_name': 'filtered_pairs', 'object_key': '4DNFIGOJW3XZ.pairs.gz', 'uuid': '0292e08e-facf-4a16-a94e-59606f2bfc71'} ]) mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.user_workflow_reader(workbooks.get(test_insert), sheet_name, connection_mock) + imp.user_workflow_reader(workbooks.get(test_insert), 'excel', sheet_name, connection_mock) args = imp.ff_utils.post_metadata.call_args out = capsys.readouterr()[0] print([i for i in args]) @@ -660,7 +665,7 @@ def test_order_sorter(capsys): message1 = '''WARNING! 
Check the sheet names and the reference list "sheet_order"''' assert ordered_list == imp.order_sorter(test_list) out = capsys.readouterr()[0] - outlist = [i.strip() for i in out.split('\n') if i is not ""] + outlist = [i.strip() for i in out.split('\n') if i] import sys if (sys.version_info > (3, 0)): assert message0 in outlist[0] @@ -697,7 +702,7 @@ def test_cabin_cross_check_dryrun(mocker, connection_mock, capsys): mocker.patch('wranglertools.import_data._verify_and_return_item', side_effect=[ {'awards': '/awards/test_award/'}, {'@id': '/awards/test_award/'} ]) - imp.cabin_cross_check(connection_mock, False, False, './tests/data_files/workbooks/Exp_Set_insert.xlsx', False, None, None) + imp.cabin_cross_check(connection_mock, False, False, False, None, None) out = capsys.readouterr()[0] message = ''' Running on: https://data.4dnucleome.org/ @@ -718,7 +723,7 @@ def test_cabin_cross_check_remote_w_single_lab_award(mocker, connection_mock, ca mocker.patch('wranglertools.import_data._verify_and_return_item', side_effect=[ {'awards': '/awards/test_award/'}, {'@id': '/awards/test_award/'} ]) - imp.cabin_cross_check(connection_mock, False, False, 'blah', True, None, None) + imp.cabin_cross_check(connection_mock, False, False, True, None, None) out = capsys.readouterr()[0] message = ''' Running on: https://data.4dnucleome.org/ @@ -741,7 +746,7 @@ def test_cabin_cross_check_not_remote_w_lab_award_options(mocker, connection_moc {'awards': '/awards/test_award/'}, {'@id': '/awards/test_award/'} ]) connection_mock.labs = ['test_lab', 'other_lab'] - imp.cabin_cross_check(connection_mock, False, False, 'blah', False, + imp.cabin_cross_check(connection_mock, False, False, False, '795847de-20b6-4f8c-ba8d-185215469cbf', 'c55dd1f0-433b-4714-bfce-8b3ae09f071c') out = capsys.readouterr()[0] print(out) @@ -765,7 +770,7 @@ def test_cabin_cross_check_remote_w_lab_award_options(mocker, connection_mock, c {'awards': '/awards/test_award/'}, {'@id': '/awards/test_award/'} ]) connection_mock.labs = ['test_lab', 'other_lab'] - imp.cabin_cross_check(connection_mock, False, False, 'blah', True, + imp.cabin_cross_check(connection_mock, False, False, True, '795847de-20b6-4f8c-ba8d-185215469cbf', 'c55dd1f0-433b-4714-bfce-8b3ae09f071c') out = capsys.readouterr()[0] print(out) @@ -791,7 +796,7 @@ def test_cabin_cross_check_remote_w_ok_award_and_no_lab_options( ]) connection_mock.lab = '/labs/bing-ren-lab/' connection_mock.labs = ['/labs/bing-ren-lab/'] - imp.cabin_cross_check(connection_mock, False, False, 'blah', True, None, '/awards/1U54DK107977-01/') + imp.cabin_cross_check(connection_mock, False, False, True, None, '/awards/1U54DK107977-01/') out = capsys.readouterr()[0] print(out) message = ''' @@ -814,7 +819,7 @@ def test_cabin_cross_check_remote_w_multilabs_no_options(mocker, connection_mock connection_mock.labs = ['/labs/bing-ren-lab/', '/labs/test-lab/'] connection_mock.award = None connection_mock.set_award = lambda x, y: None - imp.cabin_cross_check(connection_mock, False, False, 'blah', True, None, None) + imp.cabin_cross_check(connection_mock, False, False, True, None, None) out = capsys.readouterr()[0] print(out) message = ''' @@ -841,7 +846,7 @@ def test_cabin_cross_check_remote_w_labopt_and_lab_has_single_award(mocker, conn {'awards': '/awards/test_award/'}, {'@id': '/awards/test_award/'} ]) connection_mock.labs = ['test_lab', 'other_lab'] - imp.cabin_cross_check(connection_mock, False, False, 'blah', True, '/labs/test_lab/', None) + imp.cabin_cross_check(connection_mock, False, False, True, 
'/labs/test_lab/', None) out = capsys.readouterr()[0] print(out) message = ''' @@ -862,7 +867,7 @@ def test_cabin_cross_check_remote_w_unknown_lab_and_award(mocker, connection_moc mocker.patch('wranglertools.import_data.pp.Path.is_file', return_value=True) mocker.patch('wranglertools.import_data._verify_and_return_item', side_effect=[None, None]) connection_mock.labs = ['test_lab', 'other_lab'] - imp.cabin_cross_check(connection_mock, False, False, 'blah', True, 'unknown_lab', 'unknown_award') + imp.cabin_cross_check(connection_mock, False, False, True, 'unknown_lab', 'unknown_award') out = capsys.readouterr()[0] message = ''' Running on: https://data.4dnucleome.org/ @@ -889,14 +894,14 @@ def test_cabin_cross_check_remote_w_award_not_for_lab_options(mocker, connection ]) with pytest.raises(SystemExit): connection_mock.labs = ['test_lab', '/labs/bing-ren-lab'] - imp.cabin_cross_check(connection_mock, False, False, 'blah', True, '/labs/bing-ren-lab/', '/awards/non-ren-lab-award/') + imp.cabin_cross_check(connection_mock, False, False, True, '/labs/bing-ren-lab/', '/awards/non-ren-lab-award/') def test_get_all_aliases(workbooks): wbname = "Exp_Set_insert.xlsx" sheet = ["ExperimentSet"] my_aliases = {'sample_expset': 'ExperimentSet'} - all_aliases = imp.get_all_aliases(workbooks.get(wbname), sheet) + all_aliases = imp.get_all_aliases(workbooks.get(wbname), sheet, 'excel') assert my_aliases == all_aliases @@ -988,8 +993,8 @@ def test_validate_multiple_items_in_alias_dict_incorrect_type(alias_dict, connec msg = imp.validate_item(items, 'Biosample', alias_dict, connection_mock) lns = msg.split('\n') assert len(lns) == 2 - for l in lns: - assert l.startswith("ERROR") + for ln in lns: + assert ln.startswith("ERROR") def test_validate_item_not_in_alias_dict_alias_indb(mocker, connection_mock): diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py index a2753edb..f3e02491 100755 --- a/wranglertools/import_data.py +++ b/wranglertools/import_data.py @@ -159,10 +159,10 @@ def getArgs(): # pragma: no cover 'image/tiff', ) -G_API_CLIENT_ID = '258037973854-vgk9qvfsnps2gaca354bmrk80mmtf3do.apps.googleusercontent.com' # If modifying these scopes, delete the file token.json. 
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly'] + def md5(path_string): path = pp.Path(path_string).expanduser() md5sum = hashlib.md5() @@ -174,14 +174,13 @@ def md5(path_string): class WebFetchException(Exception): """ - custom exception to raise if ftp or http fetch fails + custom exception to raise if http fetch fails """ pass def authenticate(): gsauth = None - import pdb; pdb.set_trace() creddir = pp.Path(__file__).parent.joinpath('.config', 'gspread') gsauth = gspread.oauth( credentials_filename=creddir.joinpath('credentials.json'), @@ -286,7 +285,6 @@ def digest_xlsx(filename): def open_gsheets(gsid, gauth): - import pdb; pdb.set_trace() wkbk = gauth.open_by_key(gsid) sheets = [sh.title for sh in wkbk.worksheets()] return wkbk, sheets @@ -358,8 +356,6 @@ def cell_value(cell): value = cell.value if ctype == openpyxl.cell.cell.TYPE_ERROR: # pragma: no cover raise ValueError('Cell %s contains a cell error' % str(cell.coordinate)) - elif value is None: - return '' elif ctype == openpyxl.cell.cell.TYPE_BOOL: boolstr = str(value).strip() if boolstr == 'TRUE': From 1da1ff37ffb22da74f0df551b649a1149d0a8407 Mon Sep 17 00:00:00 2001 From: aschroed Date: Fri, 28 Oct 2022 13:38:10 -0400 Subject: [PATCH 05/14] removed all support for ftp download and upload; added tests --- tests/conftest.py | 41 +++++++ tests/test_import_data.py | 227 ++++++++++++++++++++++++----------- wranglertools/import_data.py | 122 ++++++------------- 3 files changed, 235 insertions(+), 155 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 8daa2d4d..1ff6967e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,6 +33,47 @@ def prompt_for_lab_award(self, lab=None, award=None): return +class MockedGoogleWorkSheet(object): + ''' very basic mocked object to represent a gsheet sheet''' + def __init__(self, title='Sheet1', data={}): + self.title = title + self.data = data + + def set_title(self, title=''): + if title: + self.title = title + + def set_data(self, data={}): + self.data = data + + def get_values(self): + return self.data.values() + + +class MockedGoogleWorkBook(object): + ''' basic mocked google workbook ''' + def __init__(self, gsid='1111', sheets=[]): + self.gsid = gsid + self.sheets = sheets + + def add_sheets(self, sheets=[]): + self.sheets.extend(sheets) + + + def worksheets(self): + return self.sheets + + def get_worksheet(self, idx): + '''return sheet at idx in sheet list''' + return self.sheets[idx] + + def worksheet(self, title): + for sheet in self.sheets: + if sheet.title == title: + return sheet + raise Exception + + @pytest.fixture def connection_mock(): keypairs = { diff --git a/tests/test_import_data.py b/tests/test_import_data.py index 0ac7cdeb..8d605ccf 100644 --- a/tests/test_import_data.py +++ b/tests/test_import_data.py @@ -1,25 +1,15 @@ import pathlib as pp +from plistlib import InvalidFileException +from gspread.exceptions import GSpreadException import pytest +import inspect import wranglertools.import_data as imp +from tests.conftest import MockedGoogleWorkSheet, MockedGoogleWorkBook # test data is in conftest.py -# # @pytest.mark.file_operation -# @pytest.mark.ftp -# def test_attachment_from_ftp(): -# attach = imp.attachment("ftp://speedtest.tele2.net/1KB.zip") -# assert attach - - -# @pytest.mark.ftp -# def test_attachment_ftp_to_nowhere(): -# with pytest.raises(Exception) as e: -# imp.attachment("ftp://on/a/road/to/nowhere/blah.txt") -# assert "urlopen error" in str(e.value) - - def convert_to_path_with_tilde(string_path): """Somehow the 
inverse of pathlib.Path.expanduser(). Helper function used to generate valid paths containing ~ """ @@ -136,6 +126,61 @@ def test_reader_wrong_sheetname(capsys): assert out == msg + +@pytest.fixture +def gs_test_data(): + return {'row1': ['a', 'b', 'c'], 'row2': ['d', 'e', 'f']} + + +@pytest.fixture +def mock_gsheet(gs_test_data): + msheet = MockedGoogleWorkSheet() + msheet.set_data(gs_test_data) + return msheet + + +def test_reader_gsheet_no_name(mock_gsheet): + test_wkbk = MockedGoogleWorkBook() + test_wkbk.add_sheets([mock_gsheet]) + res = imp.reader(test_wkbk, booktype='gsheet') + assert inspect.isgenerator(res) + + +def test_reader_gsheet_w_name(mock_gsheet): + sheetname = 'TestSheet2' + test_row_data = ['x', 'y'] + mock_sheet2 = MockedGoogleWorkSheet() + mock_sheet2.set_title(sheetname) + mock_sheet2.set_data({'row1': test_row_data}) + test_wkbk = MockedGoogleWorkBook() + test_wkbk.add_sheets([mock_gsheet, mock_sheet2]) + res = imp.reader(test_wkbk, sheetname=sheetname, booktype='gsheet') + assert inspect.isgenerator(res) + res_data = list(res) + assert len(res_data) == 1 + assert res_data[0] == test_row_data + + +def test_reader_gsheet_bad_name(mock_gsheet, capsys): + badname = 'NoSuchName' + errmsg = '\nNoSuchName\nERROR: Can not find the collection sheet in excel file (gspread error)\n' + test_wkbk = MockedGoogleWorkBook() + test_wkbk.add_sheets([mock_gsheet]) + res = imp.reader(test_wkbk, sheetname=badname, booktype='gsheet') + assert res is None + out = capsys.readouterr()[0] + assert out == errmsg + + +def test_row_generator_gsheet(mock_gsheet, gs_test_data): + res = imp.row_generator(mock_gsheet, 'gsheet') + # import pdb; pdb.set_trace() + assert(inspect.isgenerator(res)) + lres = list(res) + assert len(lres) == 2 + assert lres[0] == gs_test_data['row1'] + + def test_cell_value(workbooks): readxls = imp.reader(workbooks.get('test_cell_values.xlsx')) list_readxls = list(readxls) @@ -344,6 +389,46 @@ def test_digest_xlsx(workbooks): assert book[sheet].max_column == workbook[sheet].max_column +def test_digest_xlsx_error_on_xls(capsys): + test_filename = 'test.xls' + with pytest.raises(SystemExit): + with pytest.raises(InvalidFileException): + imp.digest_xlsx(test_filename) + out = capsys.readouterr()[0] + assert 'WARNING - Old xls format not supported' in out + + +def test_digest_xlsx_error_on_badext(capsys): + test_filename = 'test.ods' + with pytest.raises(SystemExit): + with pytest.raises(InvalidFileException): + imp.digest_xlsx(test_filename) + out = capsys.readouterr()[0] + assert "ERROR - " in out + + +def test_get_workbook_excel(mocker): + filename = 'test.xlsx' + retval = 'digested excel' + mocker.patch('wranglertools.import_data.digest_xlsx', return_value=retval) + val = imp.get_workbook(filename, 'excel') + assert val == retval + + +def test_get_workbook_gsheet(mocker): + filename = 'http://docs.google.com/test_sheet' + retval = 'digested gsheet' + mocker.patch('wranglertools.import_data.open_gsheets', return_value=retval) + val = imp.get_workbook(filename, 'gsheet', True) + assert val == retval + + +def test_get_workbook_gsheet_fail_w_no_auth(): + filename = 'http://docs.google.com/test_sheet' + with pytest.raises(GSpreadException): + imp.get_workbook(filename, 'gsheet') + + def test_workbooks_reader_no_update_no_patchall_new_doc_with_attachment(mocker, connection_mock, workbooks): # test new item submission without patchall update tags and check the return message test_insert = 'Document_insert.xlsx' @@ -388,62 +473,62 @@ def 
test_workbook_reader_no_update_no_patchall_existing_item(capsys, mocker, con assert out[0] == message -def test_workbook_reader_post_ftp_file_upload(capsys, mocker, connection_mock, workbooks): - test_insert = 'Ftp_file_test_md5.xlsx' - dict_load = {} - dict_rep = {} - dict_set = {} - all_aliases = {} - message1 = "FILECALIBRATION(1) : 1 posted / 0 not posted 0 patched / 0 not patched, 0 errors\n" - e = {'status': 'success', '@graph': [{'uuid': 'some_uuid', '@id': 'some_uuid'}]} - # mock fetching existing info, return None - mocker.patch('wranglertools.import_data.get_existing', return_value={}) - # mock upload file and skip - mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) - # mock the ftp copy - this should get it's own tests - mocker.patch('wranglertools.import_data.ftp_copy', - return_value=(True, {'md5sum': '0f343b0931126a20f133d67c2b018a3b'}, '1KB.zip')) - # mock file deletion - mocker.patch('wranglertools.import_data.pp.Path.unlink') - # mock posting new items - mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.workbook_reader(workbooks.get(test_insert), 'excel', 'FileCalibration', True, connection_mock, False, - all_aliases, dict_load, dict_rep, dict_set, True, []) - args = imp.ff_utils.post_metadata.call_args - out = capsys.readouterr()[0] - post_json_arg = args[0][0] - assert post_json_arg['md5sum'] == '0f343b0931126a20f133d67c2b018a3b' - assert message1 == out - - -def test_workbook_reader_post_ftp_file_upload_no_md5(capsys, mocker, connection_mock, workbooks): - """ This appears to actually mainly be testing the ftp_copy function - confirming that - the correct error messages are generated when you try to copy an ftp file without - including an md5sum in the post and subsequently that the workbook_reader function - will still post the metadata without uploading a file - """ - test_insert = 'Ftp_file_test.xlsx' - dict_load = {} - dict_rep = {} - dict_set = {} - all_aliases = {} - message0 = "WARNING: File not uploaded" - message1 = "Please add original md5 values of the files" - message2 = "FILECALIBRATION(1) : 1 posted / 0 not posted 0 patched / 0 not patched, 0 errors" - e = {'status': 'success', '@graph': [{'uuid': 'some_uuid', '@id': 'some_uuid'}]} - # mock fetching existing info, return None - mocker.patch('wranglertools.import_data.get_existing', return_value={}) - # mock upload file and skip - mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) - # mock posting new items - mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) - imp.workbook_reader(workbooks.get(test_insert), 'excel', 'FileCalibration', True, connection_mock, False, - all_aliases, dict_load, dict_rep, dict_set, True, []) - out = capsys.readouterr()[0] - outlist = [i.strip() for i in out.split('\n') if i.strip()] - assert message0 == outlist[0] - assert message1 == outlist[1] - assert message2 == outlist[2] +# def test_workbook_reader_post_ftp_file_upload(capsys, mocker, connection_mock, workbooks): +# test_insert = 'Ftp_file_test_md5.xlsx' +# dict_load = {} +# dict_rep = {} +# dict_set = {} +# all_aliases = {} +# message1 = "FILECALIBRATION(1) : 1 posted / 0 not posted 0 patched / 0 not patched, 0 errors\n" +# e = {'status': 'success', '@graph': [{'uuid': 'some_uuid', '@id': 'some_uuid'}]} +# # mock fetching existing info, return None +# mocker.patch('wranglertools.import_data.get_existing', return_value={}) +# # mock upload file and skip +# mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) +# # 
mock the ftp copy - this should get it's own tests +# mocker.patch('wranglertools.import_data.ftp_copy', +# return_value=(True, {'md5sum': '0f343b0931126a20f133d67c2b018a3b'}, '1KB.zip')) +# # mock file deletion +# mocker.patch('wranglertools.import_data.pp.Path.unlink') +# # mock posting new items +# mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) +# imp.workbook_reader(workbooks.get(test_insert), 'excel', 'FileCalibration', True, connection_mock, False, +# all_aliases, dict_load, dict_rep, dict_set, True, []) +# args = imp.ff_utils.post_metadata.call_args +# out = capsys.readouterr()[0] +# post_json_arg = args[0][0] +# assert post_json_arg['md5sum'] == '0f343b0931126a20f133d67c2b018a3b' +# assert message1 == out + + +# def test_workbook_reader_post_ftp_file_upload_no_md5(capsys, mocker, connection_mock, workbooks): +# """ This appears to actually mainly be testing the ftp_copy function - confirming that +# the correct error messages are generated when you try to copy an ftp file without +# including an md5sum in the post and subsequently that the workbook_reader function +# will still post the metadata without uploading a file +# """ +# test_insert = 'Ftp_file_test.xlsx' +# dict_load = {} +# dict_rep = {} +# dict_set = {} +# all_aliases = {} +# message0 = "WARNING: File not uploaded" +# message1 = "Please add original md5 values of the files" +# message2 = "FILECALIBRATION(1) : 1 posted / 0 not posted 0 patched / 0 not patched, 0 errors" +# e = {'status': 'success', '@graph': [{'uuid': 'some_uuid', '@id': 'some_uuid'}]} +# # mock fetching existing info, return None +# mocker.patch('wranglertools.import_data.get_existing', return_value={}) +# # mock upload file and skip +# mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) +# # mock posting new items +# mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) +# imp.workbook_reader(workbooks.get(test_insert), 'excel', 'FileCalibration', True, connection_mock, False, +# all_aliases, dict_load, dict_rep, dict_set, True, []) +# out = capsys.readouterr()[0] +# outlist = [i.strip() for i in out.split('\n') if i.strip()] +# assert message0 == outlist[0] +# assert message1 == outlist[1] +# assert message2 == outlist[2] @pytest.mark.file_operation @@ -892,7 +977,7 @@ def test_cabin_cross_check_remote_w_award_not_for_lab_options(mocker, connection mocker.patch('wranglertools.import_data._verify_and_return_item', side_effect=[ {'awards': ['/awards/test_award/', '/awards/1U54DK107977-01/']}, {'@id': '/awards/non-ren-lab-award/'} ]) - with pytest.raises(SystemExit): + with pytest.raises (SystemExit): connection_mock.labs = ['test_lab', '/labs/bing-ren-lab'] imp.cabin_cross_check(connection_mock, False, False, True, '/labs/bing-ren-lab/', '/awards/non-ren-lab-award/') diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py index f3e02491..a212a4f0 100755 --- a/wranglertools/import_data.py +++ b/wranglertools/import_data.py @@ -9,14 +9,13 @@ import os import pathlib as pp import re -import shutil import subprocess import sys import time import warnings # to suppress openpyxl warnings from base64 import b64encode from collections import Counter, OrderedDict -from contextlib import closing +# from contextlib import closing from urllib import request as urllib2 import gspread @@ -179,7 +178,7 @@ class WebFetchException(Exception): pass -def authenticate(): +def google_authenticate(): gsauth = None creddir = pp.Path(__file__).parent.joinpath('.config', 'gspread') gsauth = gspread.oauth( @@ 
-208,43 +207,31 @@ def mime_allowed(path, ok_mimes): def attachment(path): """Create an attachment upload object from a filename and embed the attachment as a data url. - NOTE: a url or ftp can be used but path must end in filename with extension that will match + NOTE: a url can be used but path must end in filename with extension that will match the magic detected MIME type of that file and be one of the allowed mime types """ - ftp_attach = False + url_attach = False if path.startswith('~'): path = str(pp.Path(path).expanduser()) if not pp.Path(path).is_file(): # if the path does not exist, check if it works as a URL - if path.startswith("ftp://"): # grab the file from ftp - print("\nINFO: Attempting to download file from this url %s" % path) - try: - with closing(urllib2.urlopen(path)) as r: - file_name = path.split("/")[-1] - with open(file_name, 'wb') as f: - shutil.copyfileobj(r, f) - path = file_name - ftp_attach = True - except urllib2.URLError as e: - raise WebFetchException("\nERROR : FTP fetch for 'attachment' failed - {}".format(e)) + try: + r = requests.get(path) + except Exception: + raise WebFetchException( + "\nERROR : The 'attachment' field has INVALID FILE PATH or URL ({})\n".format(path)) else: - try: - r = requests.get(path) - except Exception: - raise WebFetchException( - "\nERROR : The 'attachment' field has INVALID FILE PATH or URL ({})\n".format(path)) - else: - # if it works as a URL, but does not return 200 - if r.status_code != 200: # pragma: no cover - raise Exception("\nERROR : The 'attachment' field has INVALID URL ({})\n".format(path)) - # parse response - path = path.split("/")[-1] - try: - with open(path, "wb") as outfile: - outfile.write(r.content) - ftp_attach = True - except Exception as e: - raise Exception("\nERROR : Cannot write a tmp file to disk - {}".format(e)) + # if it works as a URL, but does not return 200 + if r.status_code != 200: # pragma: no cover + raise Exception("\nERROR : The 'attachment' field has INVALID URL ({})\n".format(path)) + # parse response + path = path.split("/")[-1] + try: + with open(path, "wb") as outfile: + outfile.write(r.content) + url_attach = True + except Exception as e: + raise Exception("\nERROR : Cannot write a tmp file to disk - {}".format(e)) attach = {} filename = pp.PurePath(path).name @@ -264,7 +251,7 @@ def attachment(path): 'type': guessed_mime, 'href': 'data:%s;base64,%s' % (guessed_mime, b64encode(stream.read()).decode('ascii')) } - if ftp_attach: + if url_attach: pp.Path(path).unlink() return attach @@ -419,7 +406,7 @@ def get_sub_field(field_name): """Construct embeded field names.""" try: return field_name.split(".")[1].rstrip('-0123456789') - except: # pragma: no cover + except Exception: # pragma: no cover return '' @@ -442,7 +429,7 @@ def get_sub_field_number(field_name): field = field_name.split(":")[0] try: return int(field.split("-")[1]) - except: + except Exception: return 0 @@ -487,7 +474,7 @@ def parse_exception(e): resp_dict = ast.literal_eval(resp_text) return resp_dict # if not re-raise - except: # pragma: no cover + except Exception: # pragma: no cover raise e @@ -871,13 +858,13 @@ def error_report(error_dic, sheet, all_aliases, connection, error_id=''): try: report.append("{sheet:<30}{eid}: {des}" .format(des=error_description, eid=error_id, sheet="ERROR " + sheet.lower())) - except: + except Exception: return error_dic # if there is a conflict elif error_dic.get('title') == "Conflict": try: report.extend(conflict_error_report(error_dic, sheet, connection)) - except: + except Exception: 
return error_dic # if nothing works, give the full error, we should add that case to our reporting else: @@ -909,7 +896,7 @@ def conflict_error_report(error_dic, sheet, connection): existing_item = ff_utils.search_metadata(search, key=connection.key) at_id = existing_item.get('@id') add_text = "please use " + at_id - except: + except Exception: # if there is a conflicting item, but it is not viewable by the user, # we should release the item to the project/public add_text = "please contact DCIC" @@ -917,17 +904,11 @@ def conflict_error_report(error_dic, sheet, connection): .format(er=error_field, des=error_value, sheet="ERROR " + sheet.lower(), at=add_text)) all_conflicts.append(conflict_rep) return all_conflicts - except: + except Exception: return def update_item(verb, file_to_upload, post_json, filename_to_post, extrafiles, connection, identifier): - # if FTP, grab the file from ftp - ftp_download = False - if file_to_upload and filename_to_post.startswith("ftp://"): - ftp_download = True - file_to_upload, post_json, filename_to_post = ftp_copy(filename_to_post, post_json) - # add the md5 if file_to_upload and not post_json.get('md5sum'): print("calculating md5 sum for file %s " % (filename_to_post)) post_json['md5sum'] = md5(filename_to_post) @@ -949,15 +930,13 @@ def update_item(verb, file_to_upload, post_json, filename_to_post, extrafiles, c e['@graph'][0]['upload_credentials'] = creds # upload upload_file_item(e, filename_to_post) - if ftp_download: - pp.Path(filename_to_post).unlink() if extrafiles: extcreds = e['@graph'][0].get('extra_files_creds') for fformat, filepath in extrafiles.items(): try: file_format = ff_utils.get_metadata(fformat, key=connection.key) ff_uuid = file_format.get('uuid') - except: + except Exception: raise "Can't find file_format item for %s" % fformat for ecred in extcreds: if ff_uuid == ecred.get('file_format'): @@ -975,29 +954,6 @@ def post_item(file_to_upload, post_json, filename_to_post, extrafiles, connectio return update_item('POST', file_to_upload, post_json, filename_to_post, extrafiles, connection, sheet) -def ftp_copy(filename_to_post, post_json): - """Downloads the file from the server, and reformats post_json.""" - if not post_json.get("md5sum"): - # if the file is from the server, the md5 should be supplied by the user. - print("\nWARNING: File not uploaded") - print("Please add original md5 values of the files") - return False, post_json, "" - try: - # download the file from the server - # return new file location to upload from - print("\nINFO: Attempting to download file from this url to your computer before upload %s" % filename_to_post) - with closing(urllib2.urlopen(filename_to_post)) as r: - new_file = post_json['filename'] - with open(new_file, 'wb') as f: - shutil.copyfileobj(r, f) - return True, post_json, new_file - except: - # if download did not work, delete the filename from the post json - print("WARNING: Download failed") - post_json.pop('filename') - return False, post_json, "" - - def delete_fields(post_json, connection, existing_data): """Deletes fields with the value '*delete*'.""" # find fields to be removed @@ -1156,7 +1112,7 @@ def workbook_reader(workbook, booktype, sheet, update, connection, patchall, ali on the options passed in. 
""" # determine right from the top if dry run - dryrun = not(update or patchall) + dryrun = not (update or patchall) all_aliases = [k for k in aliases_by_type] # dict for acumulating cycle patch data patch_loadxl = [] @@ -1520,7 +1476,7 @@ def upload_file(creds, path): # pragma: no cover 'AWS_SECURITY_TOKEN': creds['SessionToken'], }) except Exception as e: - raise("Didn't get back s3 access keys from file/upload endpoint. Error was %s" % str(e)) + raise ("Didn't get back s3 access keys from file/upload endpoint. Error was %s" % str(e)) # ~10s/GB from Stanford - AWS Oregon # ~12-15s/GB from AWS Ireland - AWS Oregon print("Uploading file.") @@ -1616,7 +1572,6 @@ def check_and_return_input_type(inputname): return inputname, 'gsheet' - def cabin_cross_check(connection, patchall, update, remote, lab=None, award=None): """Set of check for connection, input, dryrun, and prompt.""" print("Running on: {server}".format(server=connection.key['server'])) @@ -1718,7 +1673,7 @@ def get_all_aliases(workbook, sheets, booktype): keys = next(rows) # grab the first row of headers try: alias_col = keys.index("aliases") - except: + except Exception: continue for row in rows: my_aliases = [] @@ -1751,13 +1706,11 @@ def main(): # pragma: no cover cabin_cross_check(connection, args.patchall, args.update, args.remote, args.lab, args.award) # need to google authenticate to allow gsheet to be read - gauth=None + gauth = None if booktype == 'gsheet': - gauth = authenticate() - - workbook, sheetnames = get_workbook(inputname, booktype, gauth) + gauth = google_authenticate() - + workbook, sheetnames = get_workbook(inputname, booktype, gauth) # This is not in our documentation, but if single sheet is used, file name can be the collection if args.type and 'all' not in args.type: @@ -1784,8 +1737,9 @@ def main(): # pragma: no cover workbook_reader(workbook, booktype, n, args.update, connection, args.patchall, aliases_by_type, dict_loadxl, dict_replicates, dict_exp_sets, args.novalidate, attachment_fields) elif n.lower() == "experimentmic_path": - workbook_reader(workbook, booktype, "ExperimentMic_Path", args.update, connection, args.patchall, aliases_by_type, - dict_loadxl, dict_replicates, dict_exp_sets, args.novalidate, attachment_fields) + workbook_reader(workbook, booktype, "ExperimentMic_Path", args.update, connection, args.patchall, + aliases_by_type, dict_loadxl, dict_replicates, dict_exp_sets, args.novalidate, + attachment_fields) elif n.lower().startswith('user_workflow'): if args.update: user_workflow_reader(workbook, booktype, n, connection) From eea16bcf930157d7a312d0d59d031a60f403c7e8 Mon Sep 17 00:00:00 2001 From: aschroed Date: Fri, 28 Oct 2022 14:09:49 -0400 Subject: [PATCH 06/14] added gspread dependency; updated version --- poetry.lock | 238 ++++++++++++++++++++++++++++++------------------- pyproject.toml | 3 +- 2 files changed, 150 insertions(+), 91 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7e29d90d..4329d278 100644 --- a/poetry.lock +++ b/poetry.lock @@ -25,14 +25,14 @@ requests = ">=0.14.0" [[package]] name = "awscli" -version = "1.25.65" +version = "1.26.3" description = "Universal Command Line Environment for AWS." 
category = "main" optional = false python-versions = ">= 3.7" [package.dependencies] -botocore = "1.27.64" +botocore = "1.28.3" colorama = ">=0.2.5,<0.4.5" docutils = ">=0.10,<0.17" PyYAML = ">=3.10,<5.5" @@ -56,14 +56,14 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.24.64" +version = "1.25.3" description = "The AWS SDK for Python" category = "main" optional = false python-versions = ">= 3.7" [package.dependencies] -botocore = ">=1.27.64,<1.28.0" +botocore = ">=1.28.3,<1.29.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.6.0,<0.7.0" @@ -72,7 +72,7 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.27.64" +version = "1.28.3" description = "Low-level, data-driven core of boto 3." category = "main" optional = false @@ -86,9 +86,17 @@ urllib3 = ">=1.25.4,<1.27" [package.extras] crt = ["awscrt (==0.14.0)"] +[[package]] +name = "cachetools" +version = "5.2.0" +description = "Extensible memoizing collections and decorators" +category = "main" +optional = false +python-versions = "~=3.7" + [[package]] name = "certifi" -version = "2022.6.15" +version = "2022.9.24" description = "Python package for providing Mozilla's CA Bundle." category = "main" optional = false @@ -115,7 +123,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "coverage" -version = "6.4.4" +version = "6.5.0" description = "Code coverage measurement for Python" category = "dev" optional = false @@ -126,18 +134,18 @@ toml = ["tomli"] [[package]] name = "dcicutils" -version = "4.5.0" +version = "6.0.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" category = "main" optional = false -python-versions = ">=3.6.1,<3.10" +python-versions = ">=3.7,<3.10" [package.dependencies] aws-requests-auth = ">=0.4.2,<1" boto3 = ">=1.17.39,<2.0.0" botocore = ">=1.20.39,<2.0.0" docker = ">=4.4.4,<5.0.0" -elasticsearch = "6.8.1" +elasticsearch = "7.13.4" gitpython = ">=3.1.2,<4.0.0" pytz = ">=2020.4" PyYAML = ">=5.1,<5.5" @@ -177,17 +185,20 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "elasticsearch" -version = "6.8.1" +version = "7.13.4" description = "Python client for Elasticsearch" category = "main" optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, <4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" [package.dependencies] -urllib3 = ">=1.21.1" +certifi = "*" +urllib3 = ">=1.21.1,<2" [package.extras] -develop = ["requests (>=2.0.0,<3.0.0)", "nose", "coverage", "mock", "pyyaml", "nosexcover", "numpy", "pandas", "sphinx (<1.7)", "sphinx-rtd-theme"] +async = ["aiohttp (>=3,<4)"] +develop = ["requests (>=2.0.0,<3.0.0)", "coverage", "mock", "pyyaml", "pytest", "pytest-cov", "sphinx (<1.7)", "sphinx-rtd-theme", "black", "jinja2"] +docs = ["sphinx (<1.7)", "sphinx-rtd-theme"] requests = ["requests (>=2.4.0,<3.0.0)"] [[package]] @@ -211,7 +222,7 @@ smmap = ">=3.0.1,<6" [[package]] name = "gitpython" -version = "3.1.27" +version = "3.1.29" description = "GitPython is a python library used to interact with Git repositories" category = "main" optional = false @@ -221,9 +232,56 @@ python-versions = ">=3.7" gitdb = ">=4.0.1,<5" typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} +[[package]] +name = "google-auth" +version = "2.13.0" +description = "Google Authentication Library" +category = "main" +optional = false +python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*" + 
+[package.dependencies] +cachetools = ">=2.0.0,<6.0" +pyasn1-modules = ">=0.2.1" +rsa = {version = ">=3.1.4,<5", markers = "python_version >= \"3.6\""} +six = ">=1.9.0" + +[package.extras] +aiohttp = ["requests (>=2.20.0,<3.0.0dev)", "aiohttp (>=3.6.2,<4.0.0dev)"] +enterprise_cert = ["cryptography (==36.0.2)", "pyopenssl (==22.0.0)"] +pyopenssl = ["pyopenssl (>=20.0.0)"] +reauth = ["pyu2f (>=0.1.5)"] + +[[package]] +name = "google-auth-oauthlib" +version = "0.7.0" +description = "Google Authentication Library" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +google-auth = ">=2.13.0" +requests-oauthlib = ">=0.7.0" + +[package.extras] +tool = ["click (>=6.0.0)"] + +[[package]] +name = "gspread" +version = "5.6.2" +description = "Google Spreadsheets Python API" +category = "main" +optional = false +python-versions = ">=3.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.dependencies] +google-auth = ">=1.12.0" +google-auth-oauthlib = ">=0.4.1" + [[package]] name = "idna" -version = "3.3" +version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" category = "main" optional = false @@ -237,6 +295,19 @@ category = "main" optional = false python-versions = ">=3.7" +[[package]] +name = "oauthlib" +version = "3.2.2" +description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +rsa = ["cryptography (>=3.0.0)"] +signals = ["blinker (>=1.4.0)"] +signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] + [[package]] name = "openpyxl" version = "3.0.10" @@ -264,6 +335,17 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "pyasn1-modules" +version = "0.2.8" +description = "A collection of ASN.1-based protocols modules." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pyasn1 = ">=0.4.6,<0.5.0" + [[package]] name = "pytest" version = "3.0.1" @@ -323,7 +405,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "pytz" -version = "2022.2.1" +version = "2022.5" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -363,6 +445,21 @@ urllib3 = ">=1.21.1,<1.27" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-oauthlib" +version = "1.3.1" +description = "OAuthlib authentication support for Requests." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.dependencies] +oauthlib = ">=3.0.0" +requests = ">=2.0.0" + +[package.extras] +rsa = ["oauthlib[signedtoken] (>=3.0.0)"] + [[package]] name = "rfc3986" version = "1.5.0" @@ -450,7 +547,7 @@ python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" [[package]] name = "typing-extensions" -version = "4.3.0" +version = "4.4.0" description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false @@ -495,7 +592,7 @@ testing = ["pytest (>=3.1.0)", "coverage", "pytest-cov", "pytest-xdist"] [[package]] name = "websocket-client" -version = "1.4.0" +version = "1.4.1" description = "WebSocket client for Python with low level API options" category = "main" optional = false @@ -527,7 +624,7 @@ tests = ["nose (<1.3.0)", "coverage", "mock", "pastedeploy", "wsgiproxy2", "pyqu [metadata] lock-version = "1.1" python-versions = ">=3.7.0,<3.10" -content-hash = "c851ac9587a6baefead93f0f4361e6e7b2df380344cf822228d6bb2fcb64b683" +content-hash = "f4a7b730d3a69485dd4bc9bc4bff4f54eb2ce2f5b38b35d61d17b70a03832695" [metadata.files] attrs = [ @@ -545,6 +642,7 @@ beautifulsoup4 = [ ] boto3 = [] botocore = [] +cachetools = [] certifi = [] charset-normalizer = [ {file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"}, @@ -554,68 +652,14 @@ colorama = [ {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, ] -coverage = [ - {file = "coverage-6.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e7b4da9bafad21ea45a714d3ea6f3e1679099e420c8741c74905b92ee9bfa7cc"}, - {file = "coverage-6.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fde17bc42e0716c94bf19d92e4c9f5a00c5feb401f5bc01101fdf2a8b7cacf60"}, - {file = "coverage-6.4.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdbb0d89923c80dbd435b9cf8bba0ff55585a3cdb28cbec65f376c041472c60d"}, - {file = "coverage-6.4.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:67f9346aeebea54e845d29b487eb38ec95f2ecf3558a3cffb26ee3f0dcc3e760"}, - {file = "coverage-6.4.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42c499c14efd858b98c4e03595bf914089b98400d30789511577aa44607a1b74"}, - {file = "coverage-6.4.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c35cca192ba700979d20ac43024a82b9b32a60da2f983bec6c0f5b84aead635c"}, - {file = "coverage-6.4.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:9cc4f107009bca5a81caef2fca843dbec4215c05e917a59dec0c8db5cff1d2aa"}, - {file = "coverage-6.4.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5f444627b3664b80d078c05fe6a850dd711beeb90d26731f11d492dcbadb6973"}, - {file = "coverage-6.4.4-cp310-cp310-win32.whl", hash = "sha256:66e6df3ac4659a435677d8cd40e8eb1ac7219345d27c41145991ee9bf4b806a0"}, - {file = "coverage-6.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:35ef1f8d8a7a275aa7410d2f2c60fa6443f4a64fae9be671ec0696a68525b875"}, - {file = "coverage-6.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c1328d0c2f194ffda30a45f11058c02410e679456276bfa0bbe0b0ee87225fac"}, - {file = "coverage-6.4.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:61b993f3998ee384935ee423c3d40894e93277f12482f6e777642a0141f55782"}, - {file = "coverage-6.4.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d5dd4b8e9cd0deb60e6fcc7b0647cbc1da6c33b9e786f9c79721fd303994832f"}, - {file = "coverage-6.4.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7026f5afe0d1a933685d8f2169d7c2d2e624f6255fb584ca99ccca8c0e966fd7"}, - {file = "coverage-6.4.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9c7b9b498eb0c0d48b4c2abc0e10c2d78912203f972e0e63e3c9dc21f15abdaa"}, - {file = "coverage-6.4.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ee2b2fb6eb4ace35805f434e0f6409444e1466a47f620d1d5763a22600f0f892"}, - {file = "coverage-6.4.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ab066f5ab67059d1f1000b5e1aa8bbd75b6ed1fc0014559aea41a9eb66fc2ce0"}, - {file = "coverage-6.4.4-cp311-cp311-win32.whl", hash = "sha256:9d6e1f3185cbfd3d91ac77ea065d85d5215d3dfa45b191d14ddfcd952fa53796"}, - {file = "coverage-6.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:e3d3c4cc38b2882f9a15bafd30aec079582b819bec1b8afdbde8f7797008108a"}, - {file = "coverage-6.4.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a095aa0a996ea08b10580908e88fbaf81ecf798e923bbe64fb98d1807db3d68a"}, - {file = "coverage-6.4.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef6f44409ab02e202b31a05dd6666797f9de2aa2b4b3534e9d450e42dea5e817"}, - {file = "coverage-6.4.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b7101938584d67e6f45f0015b60e24a95bf8dea19836b1709a80342e01b472f"}, - {file = "coverage-6.4.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a32ec68d721c3d714d9b105c7acf8e0f8a4f4734c811eda75ff3718570b5e3"}, - {file = "coverage-6.4.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:6a864733b22d3081749450466ac80698fe39c91cb6849b2ef8752fd7482011f3"}, - {file = "coverage-6.4.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:08002f9251f51afdcc5e3adf5d5d66bb490ae893d9e21359b085f0e03390a820"}, - {file = "coverage-6.4.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a3b2752de32c455f2521a51bd3ffb53c5b3ae92736afde67ce83477f5c1dd928"}, - {file = "coverage-6.4.4-cp37-cp37m-win32.whl", hash = "sha256:f855b39e4f75abd0dfbcf74a82e84ae3fc260d523fcb3532786bcbbcb158322c"}, - {file = "coverage-6.4.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ee6ae6bbcac0786807295e9687169fba80cb0617852b2fa118a99667e8e6815d"}, - {file = "coverage-6.4.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:564cd0f5b5470094df06fab676c6d77547abfdcb09b6c29c8a97c41ad03b103c"}, - {file = "coverage-6.4.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cbbb0e4cd8ddcd5ef47641cfac97d8473ab6b132dd9a46bacb18872828031685"}, - {file = "coverage-6.4.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6113e4df2fa73b80f77663445be6d567913fb3b82a86ceb64e44ae0e4b695de1"}, - {file = "coverage-6.4.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8d032bfc562a52318ae05047a6eb801ff31ccee172dc0d2504614e911d8fa83e"}, - {file = "coverage-6.4.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e431e305a1f3126477abe9a184624a85308da8edf8486a863601d58419d26ffa"}, - {file = 
"coverage-6.4.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:cf2afe83a53f77aec067033199797832617890e15bed42f4a1a93ea24794ae3e"}, - {file = "coverage-6.4.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:783bc7c4ee524039ca13b6d9b4186a67f8e63d91342c713e88c1865a38d0892a"}, - {file = "coverage-6.4.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ff934ced84054b9018665ca3967fc48e1ac99e811f6cc99ea65978e1d384454b"}, - {file = "coverage-6.4.4-cp38-cp38-win32.whl", hash = "sha256:e1fabd473566fce2cf18ea41171d92814e4ef1495e04471786cbc943b89a3781"}, - {file = "coverage-6.4.4-cp38-cp38-win_amd64.whl", hash = "sha256:4179502f210ebed3ccfe2f78bf8e2d59e50b297b598b100d6c6e3341053066a2"}, - {file = "coverage-6.4.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:98c0b9e9b572893cdb0a00e66cf961a238f8d870d4e1dc8e679eb8bdc2eb1b86"}, - {file = "coverage-6.4.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fc600f6ec19b273da1d85817eda339fb46ce9eef3e89f220055d8696e0a06908"}, - {file = "coverage-6.4.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a98d6bf6d4ca5c07a600c7b4e0c5350cd483c85c736c522b786be90ea5bac4f"}, - {file = "coverage-6.4.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01778769097dbd705a24e221f42be885c544bb91251747a8a3efdec6eb4788f2"}, - {file = "coverage-6.4.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dfa0b97eb904255e2ab24166071b27408f1f69c8fbda58e9c0972804851e0558"}, - {file = "coverage-6.4.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:fcbe3d9a53e013f8ab88734d7e517eb2cd06b7e689bedf22c0eb68db5e4a0a19"}, - {file = "coverage-6.4.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:15e38d853ee224e92ccc9a851457fb1e1f12d7a5df5ae44544ce7863691c7a0d"}, - {file = "coverage-6.4.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6913dddee2deff8ab2512639c5168c3e80b3ebb0f818fed22048ee46f735351a"}, - {file = "coverage-6.4.4-cp39-cp39-win32.whl", hash = "sha256:354df19fefd03b9a13132fa6643527ef7905712109d9c1c1903f2133d3a4e145"}, - {file = "coverage-6.4.4-cp39-cp39-win_amd64.whl", hash = "sha256:1238b08f3576201ebf41f7c20bf59baa0d05da941b123c6656e42cdb668e9827"}, - {file = "coverage-6.4.4-pp36.pp37.pp38-none-any.whl", hash = "sha256:f67cf9f406cf0d2f08a3515ce2db5b82625a7257f88aad87904674def6ddaec1"}, - {file = "coverage-6.4.4.tar.gz", hash = "sha256:e16c45b726acb780e1e6f88b286d3c10b3914ab03438f32117c4aa52d7f30d58"}, -] +coverage = [] dcicutils = [] docker = [ {file = "docker-4.4.4-py2.py3-none-any.whl", hash = "sha256:f3607d5695be025fa405a12aca2e5df702a57db63790c73b927eb6a94aac60af"}, {file = "docker-4.4.4.tar.gz", hash = "sha256:d3393c878f575d3a9ca3b94471a3c89a6d960b35feb92f033c0de36cc9d934db"}, ] docutils = [] -elasticsearch = [ - {file = "elasticsearch-6.8.1-py2.py3-none-any.whl", hash = "sha256:540d633afcc0a32972e4b489c4559c9a96e294850853238f7a18b1cbd267c2ed"}, - {file = "elasticsearch-6.8.1.tar.gz", hash = "sha256:a8062a00b61bc7babeea028530667583a68ecb1a9f59ab0b22ff7feaf70d3564"}, -] +elasticsearch = [] et-xmlfile = [ {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, @@ -624,18 +668,16 @@ gitdb = [ {file = "gitdb-4.0.9-py3-none-any.whl", hash = "sha256:8033ad4e853066ba6ca92050b9df2f89301b8fc8bf7e9324d412a63f8bf1a8fd"}, {file = 
"gitdb-4.0.9.tar.gz", hash = "sha256:bac2fd45c0a1c9cf619e63a90d62bdc63892ef92387424b855792a6cabe789aa"}, ] -gitpython = [ - {file = "GitPython-3.1.27-py3-none-any.whl", hash = "sha256:5b68b000463593e05ff2b261acff0ff0972df8ab1b70d3cdbd41b546c8b8fc3d"}, - {file = "GitPython-3.1.27.tar.gz", hash = "sha256:1c885ce809e8ba2d88a29befeb385fcea06338d3640712b59ca623c220bb5704"}, -] -idna = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, -] +gitpython = [] +google-auth = [] +google-auth-oauthlib = [] +gspread = [] +idna = [] jmespath = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +oauthlib = [] openpyxl = [ {file = "openpyxl-3.0.10-py2.py3-none-any.whl", hash = "sha256:0ab6d25d01799f97a9464630abacbb34aafecdcaa0ef3cba6d6b3499867d0355"}, {file = "openpyxl-3.0.10.tar.gz", hash = "sha256:e47805627aebcf860edb4edf7987b1309c1b3632f3750538ed962bbcc3bd7449"}, @@ -659,6 +701,21 @@ pyasn1 = [ {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] +pyasn1-modules = [ + {file = "pyasn1-modules-0.2.8.tar.gz", hash = "sha256:905f84c712230b2c592c19470d3ca8d552de726050d1d1716282a1f6146be65e"}, + {file = "pyasn1_modules-0.2.8-py2.4.egg", hash = "sha256:0fe1b68d1e486a1ed5473f1302bd991c1611d319bba158e98b106ff86e1d7199"}, + {file = "pyasn1_modules-0.2.8-py2.5.egg", hash = "sha256:fe0644d9ab041506b62782e92b06b8c68cca799e1a9636ec398675459e031405"}, + {file = "pyasn1_modules-0.2.8-py2.6.egg", hash = "sha256:a99324196732f53093a84c4369c996713eb8c89d360a496b599fb1a9c47fc3eb"}, + {file = "pyasn1_modules-0.2.8-py2.7.egg", hash = "sha256:0845a5582f6a02bb3e1bde9ecfc4bfcae6ec3210dd270522fee602365430c3f8"}, + {file = "pyasn1_modules-0.2.8-py2.py3-none-any.whl", hash = "sha256:a50b808ffeb97cb3601dd25981f6b016cbb3d31fbf57a8b8a87428e6158d0c74"}, + {file = "pyasn1_modules-0.2.8-py3.1.egg", hash = "sha256:f39edd8c4ecaa4556e989147ebf219227e2cd2e8a43c7e7fcb1f1c18c5fd6a3d"}, + {file = "pyasn1_modules-0.2.8-py3.2.egg", hash = "sha256:b80486a6c77252ea3a3e9b1e360bc9cf28eaac41263d173c032581ad2f20fe45"}, + {file = "pyasn1_modules-0.2.8-py3.3.egg", hash = "sha256:65cebbaffc913f4fe9e4808735c95ea22d7a7775646ab690518c056784bc21b4"}, + {file = "pyasn1_modules-0.2.8-py3.4.egg", hash = "sha256:15b7c67fabc7fc240d87fb9aabf999cf82311a6d6fb2c70d00d3d0604878c811"}, + {file = "pyasn1_modules-0.2.8-py3.5.egg", hash = "sha256:426edb7a5e8879f1ec54a1864f16b882c2837bfd06eee62f2c982315ee2473ed"}, + {file = "pyasn1_modules-0.2.8-py3.6.egg", hash = "sha256:cbac4bc38d117f2a49aeedec4407d23e8866ea4ac27ff2cf7fb3e5b570df19e0"}, + {file = "pyasn1_modules-0.2.8-py3.7.egg", hash = "sha256:c29a5e5cc7a3f05926aff34e097e84f8589cd790ce0ed41b67aed6857b26aafd"}, +] pytest = [] pytest-cov = [] pytest-mock = [ @@ -670,10 +727,7 @@ python-dateutil = [ {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, ] python-magic = [] -pytz = [ - {file = "pytz-2022.2.1-py2.py3-none-any.whl", hash = 
"sha256:220f481bdafa09c3955dfbdddb7b57780e9a94f5127e35456a48589b9e0c0197"}, - {file = "pytz-2022.2.1.tar.gz", hash = "sha256:cea221417204f2d1a2aa03ddae3e867921971d0d76f14d87abb4414415bbdcf5"}, -] +pytz = [] pywin32 = [ {file = "pywin32-227-cp27-cp27m-win32.whl", hash = "sha256:371fcc39416d736401f0274dd64c2302728c9e034808e37381b5e1b22be4a6b0"}, {file = "pywin32-227-cp27-cp27m-win_amd64.whl", hash = "sha256:4cdad3e84191194ea6d0dd1b1b9bdda574ff563177d2adf2b4efec2a244fa116"}, @@ -690,6 +744,10 @@ pywin32 = [ ] pyyaml = [] requests = [] +requests-oauthlib = [ + {file = "requests-oauthlib-1.3.1.tar.gz", hash = "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"}, + {file = "requests_oauthlib-1.3.1-py2.py3-none-any.whl", hash = "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5"}, +] rfc3986 = [ {file = "rfc3986-1.5.0-py2.py3-none-any.whl", hash = "sha256:a86d6e1f5b1dc238b218b012df0aa79409667bb209e58da56d0b94704e712a97"}, {file = "rfc3986-1.5.0.tar.gz", hash = "sha256:270aaf10d87d0d4e095063c65bf3ddbc6ee3d0b226328ce21e036f946e421835"}, @@ -727,8 +785,8 @@ webob = [ {file = "WebOb-1.8.7.tar.gz", hash = "sha256:b64ef5141be559cfade448f044fa45c2260351edcb6a8ef6b7e00c7dcef0c323"}, ] websocket-client = [ - {file = "websocket-client-1.4.0.tar.gz", hash = "sha256:79d730c9776f4f112f33b10b78c8d209f23b5806d9a783e296b3813fc5add2f1"}, - {file = "websocket_client-1.4.0-py3-none-any.whl", hash = "sha256:33ad3cf0aef4270b95d10a5a66b670a66be1f5ccf10ce390b3644f9eddfdca9d"}, + {file = "websocket-client-1.4.1.tar.gz", hash = "sha256:f9611eb65c8241a67fb373bef040b3cf8ad377a9f6546a12b620b6511e8ea9ef"}, + {file = "websocket_client-1.4.1-py3-none-any.whl", hash = "sha256:398909eb7e261f44b8f4bd474785b6ec5f5b499d4953342fe9755e01ef624090"}, ] webtest = [ {file = "WebTest-2.0.35-py2.py3-none-any.whl", hash = "sha256:44ddfe99b5eca4cf07675e7222c81dd624d22f9a26035d2b93dc8862dc1153c6"}, diff --git a/pyproject.toml b/pyproject.toml index 7fbd1e56..1f2a8e3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "Submit4DN" -version = "3.1.1" +version = "3.2.0" description = "Utility package for submitting data to the 4DN Data Portal" authors = ["4DN-DCIC Team "] license = "MIT" @@ -16,6 +16,7 @@ python = ">=3.7.0,<3.10" python-magic = ">=0.4.12" attrs = ">=21.4" openpyxl = "^3.0.9" +gspread = "^5.6.0" dcicutils = ">=4.0" # awscli is not directly imported but is required for aws cp operation awscli = "^1.22.88" From 0b1737eec693a134aa5e585cbea7e71e009db105 Mon Sep 17 00:00:00 2001 From: aschroed Date: Fri, 28 Oct 2022 17:56:58 -0400 Subject: [PATCH 07/14] removed unused imports, linting updates, added another test --- tests/conftest.py | 14 +++++++++++++- tests/test_import_data.py | 15 +++++++++++---- wranglertools/import_data.py | 2 -- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 1ff6967e..90ec8dce 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,6 +33,18 @@ def prompt_for_lab_award(self, lab=None, award=None): return +class MockedGauth(object): + def __init__(self): + pass + + def open_by_key(self, gsid): + wkbk = MockedGoogleWorkBook() + sheet2 = MockedGoogleWorkSheet() + sheet2.set_title('Sheet2') + wkbk.add_sheets([MockedGoogleWorkSheet(), sheet2]) + return wkbk + + class MockedGoogleWorkSheet(object): ''' very basic mocked object to represent a gsheet sheet''' def __init__(self, title='Sheet1', data={}): @@ -57,7 +69,7 @@ def __init__(self, gsid='1111', sheets=[]): self.sheets = 
sheets def add_sheets(self, sheets=[]): - self.sheets.extend(sheets) + self.sheets = sheets def worksheets(self): diff --git a/tests/test_import_data.py b/tests/test_import_data.py index 8d605ccf..6920385e 100644 --- a/tests/test_import_data.py +++ b/tests/test_import_data.py @@ -5,7 +5,7 @@ import pytest import inspect import wranglertools.import_data as imp -from tests.conftest import MockedGoogleWorkSheet, MockedGoogleWorkBook +from tests.conftest import MockedGoogleWorkSheet, MockedGoogleWorkBook, MockedGauth # test data is in conftest.py @@ -126,7 +126,6 @@ def test_reader_wrong_sheetname(capsys): assert out == msg - @pytest.fixture def gs_test_data(): return {'row1': ['a', 'b', 'c'], 'row2': ['d', 'e', 'f']} @@ -175,12 +174,20 @@ def test_reader_gsheet_bad_name(mock_gsheet, capsys): def test_row_generator_gsheet(mock_gsheet, gs_test_data): res = imp.row_generator(mock_gsheet, 'gsheet') # import pdb; pdb.set_trace() - assert(inspect.isgenerator(res)) + assert inspect.isgenerator(res) lres = list(res) assert len(lres) == 2 assert lres[0] == gs_test_data['row1'] +def test_open_gsheets(): + test_names = ['Sheet1', 'Sheet2'] + ret_wkbk, ret_names = imp.open_gsheets('gsid1234', MockedGauth()) + assert type(ret_wkbk) is MockedGoogleWorkBook + for rn in ret_names: + assert rn in test_names + + def test_cell_value(workbooks): readxls = imp.reader(workbooks.get('test_cell_values.xlsx')) list_readxls = list(readxls) @@ -977,7 +984,7 @@ def test_cabin_cross_check_remote_w_award_not_for_lab_options(mocker, connection mocker.patch('wranglertools.import_data._verify_and_return_item', side_effect=[ {'awards': ['/awards/test_award/', '/awards/1U54DK107977-01/']}, {'@id': '/awards/non-ren-lab-award/'} ]) - with pytest.raises (SystemExit): + with pytest.raises(SystemExit): connection_mock.labs = ['test_lab', '/labs/bing-ren-lab'] imp.cabin_cross_check(connection_mock, False, False, True, '/labs/bing-ren-lab/', '/awards/non-ren-lab-award/') diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py index a212a4f0..9a68ef43 100755 --- a/wranglertools/import_data.py +++ b/wranglertools/import_data.py @@ -15,8 +15,6 @@ import warnings # to suppress openpyxl warnings from base64 import b64encode from collections import Counter, OrderedDict -# from contextlib import closing -from urllib import request as urllib2 import gspread # https://github.com/ahupp/python-magic From 2cf795a9bbad0e87750ebb48c381106d9e617716 Mon Sep 17 00:00:00 2001 From: aschroed Date: Mon, 31 Oct 2022 18:15:17 -0400 Subject: [PATCH 08/14] made some updates based on Will's feedback - breaking out constants; still need to fix a few tests --- tests/test_import_data.py | 58 ---------------- wranglertools/constants.py | 104 +++++++++++++++++++++++++++++ wranglertools/get_field_info.py | 87 ++++++++++-------------- wranglertools/import_data.py | 115 ++++++++++---------------------- 4 files changed, 176 insertions(+), 188 deletions(-) create mode 100755 wranglertools/constants.py diff --git a/tests/test_import_data.py b/tests/test_import_data.py index 6920385e..abb7538e 100644 --- a/tests/test_import_data.py +++ b/tests/test_import_data.py @@ -480,64 +480,6 @@ def test_workbook_reader_no_update_no_patchall_existing_item(capsys, mocker, con assert out[0] == message -# def test_workbook_reader_post_ftp_file_upload(capsys, mocker, connection_mock, workbooks): -# test_insert = 'Ftp_file_test_md5.xlsx' -# dict_load = {} -# dict_rep = {} -# dict_set = {} -# all_aliases = {} -# message1 = "FILECALIBRATION(1) : 1 posted / 0 not 
posted 0 patched / 0 not patched, 0 errors\n" -# e = {'status': 'success', '@graph': [{'uuid': 'some_uuid', '@id': 'some_uuid'}]} -# # mock fetching existing info, return None -# mocker.patch('wranglertools.import_data.get_existing', return_value={}) -# # mock upload file and skip -# mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) -# # mock the ftp copy - this should get it's own tests -# mocker.patch('wranglertools.import_data.ftp_copy', -# return_value=(True, {'md5sum': '0f343b0931126a20f133d67c2b018a3b'}, '1KB.zip')) -# # mock file deletion -# mocker.patch('wranglertools.import_data.pp.Path.unlink') -# # mock posting new items -# mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) -# imp.workbook_reader(workbooks.get(test_insert), 'excel', 'FileCalibration', True, connection_mock, False, -# all_aliases, dict_load, dict_rep, dict_set, True, []) -# args = imp.ff_utils.post_metadata.call_args -# out = capsys.readouterr()[0] -# post_json_arg = args[0][0] -# assert post_json_arg['md5sum'] == '0f343b0931126a20f133d67c2b018a3b' -# assert message1 == out - - -# def test_workbook_reader_post_ftp_file_upload_no_md5(capsys, mocker, connection_mock, workbooks): -# """ This appears to actually mainly be testing the ftp_copy function - confirming that -# the correct error messages are generated when you try to copy an ftp file without -# including an md5sum in the post and subsequently that the workbook_reader function -# will still post the metadata without uploading a file -# """ -# test_insert = 'Ftp_file_test.xlsx' -# dict_load = {} -# dict_rep = {} -# dict_set = {} -# all_aliases = {} -# message0 = "WARNING: File not uploaded" -# message1 = "Please add original md5 values of the files" -# message2 = "FILECALIBRATION(1) : 1 posted / 0 not posted 0 patched / 0 not patched, 0 errors" -# e = {'status': 'success', '@graph': [{'uuid': 'some_uuid', '@id': 'some_uuid'}]} -# # mock fetching existing info, return None -# mocker.patch('wranglertools.import_data.get_existing', return_value={}) -# # mock upload file and skip -# mocker.patch('wranglertools.import_data.upload_file_item', return_value={}) -# # mock posting new items -# mocker.patch('dcicutils.ff_utils.post_metadata', return_value=e) -# imp.workbook_reader(workbooks.get(test_insert), 'excel', 'FileCalibration', True, connection_mock, False, -# all_aliases, dict_load, dict_rep, dict_set, True, []) -# out = capsys.readouterr()[0] -# outlist = [i.strip() for i in out.split('\n') if i.strip()] -# assert message0 == outlist[0] -# assert message1 == outlist[1] -# assert message2 == outlist[2] - - @pytest.mark.file_operation def test_workbook_reader_update_new_file_fastq_post_and_file_upload(capsys, mocker, connection_mock, workbooks): """ This appears to actually mainly be testing the md5 function - confirming that diff --git a/wranglertools/constants.py b/wranglertools/constants.py new file mode 100755 index 00000000..459f2b9e --- /dev/null +++ b/wranglertools/constants.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +# -*- coding: latin-1 -*- +import pathlib as pp + + +''' 2022-10-31 add a .submit4dn directory to store keypairs.json and if used google auth files + the home directory will still be automatically checked to support older installations and + an enviromental variable will also be queried for +''' +HOME = pp.Path.home() +CONFDIR = HOME.joinpath('.submit4dn') +DEFAULT_KEYPAIR_FILE = 'keypairs.json' +ENV_VAR_DIR = 'SUBMIT_4DN_CONF_DIR' + +SHEET_ORDER = [ + "User", "Award", "Lab", "Document", "Protocol", 
"ExperimentType", + "Publication", "Organism", "Vendor", "IndividualChicken", "IndividualFly", + "IndividualHuman", "IndividualMouse", "IndividualPrimate", + "IndividualZebrafish", "FileFormat", "Enzyme", "GenomicRegion", "Gene", + "BioFeature", "Construct", "TreatmentRnai", "TreatmentAgent", + "Antibody", "Modification", "Image", "Biosource", "BiosampleCellCulture", + "Biosample", "FileFastq", "FileProcessed", "FileReference", + "FileCalibration", "FileSet", "FileSetCalibration", "MicroscopeSettingD1", + "MicroscopeSettingD2", "MicroscopeSettingA1", "MicroscopeSettingA2", + "FileMicroscopy", "FileSetMicroscopeQc", "ImagingPath", "ExperimentMic", + "ExperimentMic_Path", "ExperimentHiC", "ExperimentCaptureC", + "ExperimentRepliseq", "ExperimentAtacseq", "ExperimentChiapet", + "ExperimentDamid", "ExperimentSeq", "ExperimentTsaseq", "ExperimentSet", + "ExperimentSetReplicate", "WorkflowRunSbg", "WorkflowRunAwsem", + "OntologyTerm" +] + +# list of [sheet, [fields]] that need to be patched as a second step +# should be in sync with loadxl.py in fourfront +LIST_OF_LOADXL_FIELDS = [ + ['Document', ['references']], + ['User', ['lab', 'submits_for']], + ['ExperimentType', ['sop', 'reference_pubs']], + ['Biosample', ['biosample_relation']], + ['Experiment', ['experiment_relation']], + ['ExperimentMic', ['experiment_relation']], + ['ExperimentHiC', ['experiment_relation']], + ['ExperimentSeq', ['experiment_relation']], + ['ExperimentTsaseq', ['experiment_relation']], + ['ExperimentDamid', ['experiment_relation']], + ['ExperimentChiapet', ['experiment_relation']], + ['ExperimentAtacseq', ['experiment_relation']], + ['ExperimentCaptureC', ['experiment_relation']], + ['ExperimentRepliseq', ['experiment_relation']], + ['FileFastq', ['related_files']], + ['FileReference', ['related_files']], + ['FileCalibration', ['related_files']], + ['FileMicroscopy', ['related_files']], + ['FileProcessed', ['related_files', 'produced_from']], + ['Individual', ['individual_relation']], + ['IndividualChicken', ['individual_relation']], + ['IndividualFly', ['individual_relation']], + ['IndividualHuman', ['individual_relation']], + ['IndividualMouse', ['individual_relation']], + ['IndividualPrimate', ['individual_relation']], + ['IndividualZebrafish', ['individual_relation']], + ['Publication', ['exp_sets_prod_in_pub', 'exp_sets_used_in_pub']] +] + +# these may change are special so adding as explicit constant +XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' +ZIP_MIME = 'application/zip' + +ALLOWED_MIMES = ( + 'application/pdf', + ZIP_MIME, + 'text/plain', + 'text/tab-separated-values', + 'text/html', + 'application/msword', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.ms-excel', + XLSX_MIME, + 'image/png', + 'image/jpeg', + 'image/gif', + 'image/tiff', +) + +''' These are the scope of access needed for accessing google sheets + Currently only read is supported and needed - this is used in the + google oauth athentication workflow. 
+ NOTE: if additional access scopes are wanted needed this will need + re-approval by google - AJS 2022-10-31''' +SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly'] +GCRED_FNAME = 'credentials.json' +AUTH_TOKEN_FNAME = 'authorized_user.json' +# pattern to search url for a google sheet ID +GSHEET_URL_REGEX = "/spreadsheets/d/([A-Za-z0-9_-]+)/*" +# gets all the characters after /d/ and before the next slash +# https://docs.google.com/spreadsheets/d/1hy9iilJUfAIbANCkuDZtbOL5wiQ8nKUGRwENK3qlWj4/edit#gid=0 +# https://docs.google.com/spreadsheets/d/1jMY15_7Qmmj5tYPLtDFXj87H-Qy732E0kYtU1S4Ddgs/edit#gid=1689247783 +# and then make sure it only has valid characters +GSID_REGEX = "^[A-Za-z0-9_-]+$" +# can only contain alpha-numerics or _ or - +# 1hy9iilJUfAIbANCkuDZtbOL5wiQ8nKUGRwENK3qlWj4 or 1jMY15_7Qmmj5tYPLtDFXj87H-Qy732E0kYtU1S4Ddgs +# supported spreadsheet types +GSHEET = 'gsheet' # google spreadsheet +EXCEL = 'excel' # excel xlsx workbook diff --git a/wranglertools/get_field_info.py b/wranglertools/get_field_info.py index 9995478c..34bc5782 100755 --- a/wranglertools/get_field_info.py +++ b/wranglertools/get_field_info.py @@ -5,23 +5,17 @@ from dcicutils import ff_utils import attr import openpyxl +import os import sys import json +from wranglertools.constants import ( + HOME, CONFDIR, ENV_VAR_DIR, DEFAULT_KEYPAIR_FILE, SHEET_ORDER +) + EPILOG = ''' To create an excel workbook file with sheets to be filled use the examples below and modify to your needs. - It will accept the following optional parameters. - --keyfile the path to the file where you have stored your access key info (default ~/keypairs.json) - --key the name of the key identifier for the access key and secret in your keys file (default=default) - --type use for each sheet that you want to add to the excel workbook - --nodesc do not add the descriptions in the second line (by default they are added) - --noenums do not add the list of options for a field if they are specified (by default they are added) - --comments adds any (usually internal) comments together with enums (by default False) - --outfile change the default file name "fields.xlsx" to a specified one - --debug to add more debugging output - --noadmin if you have admin access to 4DN this option lets you generate the sheet as a non-admin user - This program graphs uploadable fields (i.e. not calculated properties) for a type with optionally included description and enum values. @@ -54,15 +48,14 @@ def _remove_all_from_types(args): def create_common_arg_parser(): - home = pp.Path.home() parser = argparse.ArgumentParser(add_help=False) parser.add_argument('--key', default='default', help="The keypair identifier from the keyfile. \ Default is --key=default") parser.add_argument('--keyfile', - default=home / 'keypairs.json', - help=f"The keypair file. Default is --keyfile={home / 'keypairs.json'}") + default=CONFDIR / 'keypairs.json', + help=f"The keypair file. 
Default is --keyfile={CONFDIR / DEFAULT_KEYPAIR_FILE}")
     parser.add_argument('--debug',
                         default=False,
                         action='store_true',
@@ -109,21 +102,33 @@ def getArgs():  # pragma: no cover
 class FDN_Key:
     def __init__(self, keyfile, keyname):
         self.error = False
+        keys = None
+        envdir = os.environ.get(ENV_VAR_DIR)
         # is the keyfile a dictionary
         if isinstance(keyfile, dict):
             keys = keyfile
-        # is the keyfile a file (the expected case)
-        elif pp.Path(str(keyfile)).is_file():
-            keys_f = open(keyfile, 'r')
-            keys_json_string = keys_f.read()
-            keys_f.close()
-            keys = json.loads(keys_json_string)
-        # if both fail, the file does not exist
         else:
-            print("\nThe keyfile does not exist, check the --keyfile path or add 'keypairs.json' to your home folder\n")
-            self.error = True
-            return
-        self.con_key = keys[keyname]
+            if envdir:  # conf dir holding keypairs.json specified in env var
+                fpath = pp.Path(envdir).joinpath(DEFAULT_KEYPAIR_FILE)
+            else:  # check if file
+                fpath = pp.Path(str(keyfile))
+                if not fpath.is_file():
+                    # maybe it's stored in the old default home dir
+                    fpath = HOME.joinpath(DEFAULT_KEYPAIR_FILE)
+                    if not fpath.is_file():
+                        print("\nThe keyfile does not exist\n"
+                              f"check the --keyfile path or add 'keypairs.json' to {CONFDIR}\n")
+                        self.error = True
+                        return
+
+            with open(fpath, 'r') as keys_f:
+                keys_json_string = keys_f.read()
+                keys = json.loads(keys_json_string)
+        try:
+            self.con_key = keys[keyname]
+        except KeyError:
+            print(f"ERROR: No key with {keyname} found - check your keypairs file")
+            sys.exit(1)
         if not self.con_key['server'].endswith("/"):
             self.con_key['server'] += "/"
 
@@ -144,7 +149,7 @@ def __init__(self, key4dn):
             self.email = me_page['email']
             self.check = True
             self.admin = True if 'admin' in me_page.get('groups', []) else False
-        except:
+        except Exception:
             print('Can not establish connection, please check your keys')
             me_page = {}
         if not me_page:
@@ -238,27 +243,9 @@ class FieldInfo(object):
 ]
 
 
-sheet_order = [
-    "User", "Award", "Lab", "Document", "Protocol", "ExperimentType",
-    "Publication", "Organism", "Vendor", "IndividualChicken", "IndividualFly",
-    "IndividualHuman", "IndividualMouse", "IndividualPrimate",
-    "IndividualZebrafish", "FileFormat", "Enzyme", "GenomicRegion", "Gene",
-    "BioFeature", "Construct", "TreatmentRnai", "TreatmentAgent",
-    "Antibody", "Modification", "Image", "Biosource", "BiosampleCellCulture",
-    "Biosample", "FileFastq", "FileProcessed", "FileReference",
-    "FileCalibration", "FileSet", "FileSetCalibration", "MicroscopeSettingD1",
-    "MicroscopeSettingD2", "MicroscopeSettingA1", "MicroscopeSettingA2",
-    "FileMicroscopy", "FileSetMicroscopeQc", "ImagingPath", "ExperimentMic",
-    "ExperimentMic_Path", "ExperimentHiC", "ExperimentCaptureC",
-    "ExperimentRepliseq", "ExperimentAtacseq", "ExperimentChiapet",
-    "ExperimentDamid", "ExperimentSeq", "ExperimentTsaseq", "ExperimentSet",
-    "ExperimentSetReplicate", "WorkflowRunSbg", "WorkflowRunAwsem",
-    "OntologyTerm"
-]
-
-file_types = [i for i in sheet_order if i.startswith('File') and not i.startswith('FileSet')]
+file_types = [i for i in SHEET_ORDER if i.startswith('File') and not i.startswith('FileSet')]
 file_types.remove('FileFormat')
-exp_types = [i for i in sheet_order if i.startswith('Experiment') and 'Type' not in i and 'Set' not in i]
+exp_types = [i for i in SHEET_ORDER if i.startswith('Experiment') and 'Type' not in i and 'Set' not in i]
 
 
 def get_field_type(field):
@@ -283,7 +270,7 @@ def is_subobject(field):
        return True
    try:
        return field['items']['type'] == 'object'
-    except:
+    except Exception:
        return False


@@ -400,7 +387,7 @@ def
create_excel(all_fields, filename): wb = openpyxl.Workbook() wb.remove(wb.active) # removes the by default created empty sheet named Sheet # order sheets - sheet_list = [(sheet, all_fields[sheet]) for sheet in sheet_order if sheet in all_fields.keys()] + sheet_list = [(sheet, all_fields[sheet]) for sheet in SHEET_ORDER if sheet in all_fields.keys()] for obj_name, fields in sheet_list: ws = wb.create_sheet(title=obj_name) ws.cell(row=1, column=1, value="#Field Name:") @@ -431,7 +418,7 @@ def get_sheet_names(types_list): lowercase_types = [item.lower().replace('-', '').replace('_', '') for item in types_list if item != 'ExperimentMic_Path'] if lowercase_types == ['all']: - sheets = [sheet for sheet in sheet_order if sheet not in ['ExperimentMic_Path', 'OntologyTerm']] + sheets = [sheet for sheet in SHEET_ORDER if sheet not in ['ExperimentMic_Path', 'OntologyTerm']] else: presets = { 'hic': ["image", "filefastq", "experimenthic"], @@ -457,7 +444,7 @@ def get_sheet_names(types_list): 'protocol', 'publication', 'biosource', 'biosample', 'biosamplecellculture', 'image', 'experimentsetreplicate' ] - sheets = [sheet for sheet in sheet_order if sheet.lower() in lowercase_types] + sheets = [sheet for sheet in SHEET_ORDER if sheet.lower() in lowercase_types] for name in types_list: modified_name = name.lower().replace('-', '').replace('_', '') if modified_name in lowercase_types and modified_name not in [sheetname.lower() for sheetname in sheets]: diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py index 9a68ef43..73a6bc29 100755 --- a/wranglertools/import_data.py +++ b/wranglertools/import_data.py @@ -25,10 +25,14 @@ from dcicutils import ff_utils from gspread.exceptions import GSpreadException from openpyxl.utils.exceptions import InvalidFileException +from wranglertools.constants import ( + CONFDIR, GSID_REGEX, SHEET_ORDER, LIST_OF_LOADXL_FIELDS, ENV_VAR_DIR, GCRED_FNAME, + AUTH_TOKEN_FNAME, SCOPES, GSHEET, EXCEL, ZIP_MIME, XLSX_MIME, ALLOWED_MIMES, + GSHEET_URL_REGEX, GSID_REGEX +) from wranglertools.get_field_info import (FDN_Connection, FDN_Key, _remove_all_from_types, - create_common_arg_parser, - sheet_order) + create_common_arg_parser) def getArgs(): # pragma: no cover @@ -107,59 +111,6 @@ def getArgs(): # pragma: no cover ''' -# list of [sheet, [fields]] that need to be patched as a second step -# should be in sync with loadxl.py in fourfront -list_of_loadxl_fields = [ - ['Document', ['references']], - ['User', ['lab', 'submits_for']], - ['ExperimentType', ['sop', 'reference_pubs']], - ['Biosample', ['biosample_relation']], - ['Experiment', ['experiment_relation']], - ['ExperimentMic', ['experiment_relation']], - ['ExperimentHiC', ['experiment_relation']], - ['ExperimentSeq', ['experiment_relation']], - ['ExperimentTsaseq', ['experiment_relation']], - ['ExperimentDamid', ['experiment_relation']], - ['ExperimentChiapet', ['experiment_relation']], - ['ExperimentAtacseq', ['experiment_relation']], - ['ExperimentCaptureC', ['experiment_relation']], - ['ExperimentRepliseq', ['experiment_relation']], - ['FileFastq', ['related_files']], - ['FileReference', ['related_files']], - ['FileCalibration', ['related_files']], - ['FileMicroscopy', ['related_files']], - ['FileProcessed', ['related_files', 'produced_from']], - ['Individual', ['individual_relation']], - ['IndividualChicken', ['individual_relation']], - ['IndividualFly', ['individual_relation']], - ['IndividualHuman', ['individual_relation']], - ['IndividualMouse', ['individual_relation']], - ['IndividualPrimate', 
['individual_relation']],
-    ['IndividualZebrafish', ['individual_relation']],
-    ['Publication', ['exp_sets_prod_in_pub', 'exp_sets_used_in_pub']]
-]
-
-
-ALLOWED_MIMES = (
-    'application/pdf',
-    'application/zip',
-    'text/plain',
-    'text/tab-separated-values',
-    'text/html',
-    'application/msword',
-    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-    'application/vnd.ms-excel',
-    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
-    'image/png',
-    'image/jpeg',
-    'image/gif',
-    'image/tiff',
-)
-
-# If modifying these scopes, delete the file token.json.
-SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']
-
-
 def md5(path_string):
     path = pp.Path(path_string).expanduser()
     md5sum = hashlib.md5()
@@ -178,12 +129,17 @@ class WebFetchException(Exception):


 def google_authenticate():
     gsauth = None
-    creddir = pp.Path(__file__).parent.joinpath('.config', 'gspread')
-    gsauth = gspread.oauth(
-        credentials_filename=creddir.joinpath('credentials.json'),
-        authorized_user_filename=creddir.joinpath('authorized_user.json'),
-        scopes=SCOPES
-    )
+    ga_cred_env = os.environ.get(ENV_VAR_DIR)  # look to see if set as env variable
+    # default to .submit4dn dir in home dir
+    creddir = pp.Path(ga_cred_env) if ga_cred_env else CONFDIR
+    try:
+        gsauth = gspread.oauth(
+            credentials_filename=creddir.joinpath(GCRED_FNAME),
+            authorized_user_filename=creddir.joinpath(AUTH_TOKEN_FNAME),
+            scopes=SCOPES
+        )
+    except GSpreadException as gse:
+        raise GSpreadException(f"GOOGLE AUTH PROBLEM: {gse}")
     return gsauth

@@ -197,7 +153,7 @@ def mime_allowed(path, ok_mimes):
     # NOTE: this whole guessing and detecting bit falls apart for zip files which seems a bit dodgy
     # some .zip files are detected as generic application/octet-stream but don't see a good way to verify
     # basically relying on extension with a little verification by magic for most file types
-    if detected_mime != guessed_mime and guessed_mime != 'application/zip':
+    if detected_mime != guessed_mime and guessed_mime != ZIP_MIME:
         print('Wrong extension for %s: %s' % (detected_mime, filename))
         return False
     return guessed_mime
@@ -240,7 +196,7 @@ def attachment(path):
     # basically relying on extension with a little verification by magic for most file types
     if guessed_mime not in ALLOWED_MIMES:
         raise ValueError("Unallowed file type for %s" % filename)
-    if detected_mime != guessed_mime and guessed_mime != 'application/zip':
+    if detected_mime != guessed_mime and guessed_mime != ZIP_MIME:
         raise ValueError('Wrong extension for %s: %s' % (detected_mime, filename))

     with open(path, 'rb') as stream:
@@ -276,11 +232,11 @@ def open_gsheets(gsid, gauth):


 def get_workbook(inputname, booktype, gauth=None):
-    if booktype == 'excel':
+    if booktype == EXCEL:
         return digest_xlsx(inputname)
-    elif booktype == 'gsheet':
+    elif booktype == GSHEET:
         if not gauth:
-            raise GSpreadException("Google authentication problem")
+            raise Exception("ERROR: Trying to submit with Google sheets but no authentication found")
         return open_gsheets(inputname, gauth)


@@ -288,7 +244,7 @@ def reader(workbook, sheetname=None, booktype=None):
     """Read named sheet or first and only sheet from xlsx or google sheets file.
Assume excel by default - will choke if no booktype and not excel""" sheet = None - if not booktype or booktype == 'excel': + if not booktype or booktype == EXCEL: if sheetname is None: sheet = workbook.worksheets[0] else: @@ -299,7 +255,7 @@ def reader(workbook, sheetname=None, booktype=None): print(sheetname) print("ERROR: Can not find the collection sheet in excel file (openpyxl error)") return - elif booktype == 'gsheet': + elif booktype == GSHEET: if sheetname is None: sheet = workbook.get_worksheet(0) else: @@ -777,9 +733,9 @@ def filter_set_from_exps(post_json): def filter_loadxl_fields(post_json, sheet): - """All fields from the list_of_loadxl_fields are taken out of post_json and accumulated in dictionary.""" + """All fields from the LIST_OF_LOADXL_FIELDS are taken out of post_json and accumulated in dictionary.""" patch_loadxl_item = {} - for sheet_loadxl, fields_loadxl in list_of_loadxl_fields: + for sheet_loadxl, fields_loadxl in LIST_OF_LOADXL_FIELDS: if sheet == sheet_loadxl: for field_loadxl in fields_loadxl: if post_json.get(field_loadxl): @@ -1506,7 +1462,7 @@ def running_on_windows_native(): # used to avoid dependencies... i.e. biosample needs the biosource to exist def order_sorter(list_of_names): ret_list = [] - for i in sheet_order: + for i in SHEET_ORDER: if i in list_of_names: ret_list.append(i) # we add the list of user supplied workflows at the end @@ -1516,7 +1472,7 @@ def order_sorter(list_of_names): if list(set(list_of_names)-set(ret_list)) != []: missing_items = ", ".join(list(set(list_of_names)-set(ret_list))) print("WARNING!", missing_items, "sheet(s) are not loaded") - print("WARNING! Check the sheet names and the reference list \"sheet_order\"") + print("WARNING! Check the sheet names and the list in constant \"SHEET_ORDER\"") return ret_list @@ -1552,22 +1508,21 @@ def _verify_and_return_item(item, connection): def check_and_return_input_type(inputname): if pp.Path(inputname).is_file(): - xlsx_mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' # specific check for xlsx - if not mime_allowed(inputname, xlsx_mime): + if not mime_allowed(inputname, XLSX_MIME): print(f"ERROR: File {inputname} not recognized as excel file") sys.exit(1) - return inputname, 'excel' - elif inputname.startswith('http'): + return inputname, EXCEL + elif inputname.startswith('https'): # assume a url to google sheet and look for google? 
if 'google' not in inputname: print("ERROR: URL provided does not appear to be google sheet url") sys.exit(1) # parse out the bookId - inputname = re.search("/d/([A-Za-z0-9_-]+)/*", inputname).group(1) - if not re.match("^[A-Za-z0-9_-]+$", inputname): + inputname = re.search(GSHEET_URL_REGEX, inputname).group(1) + if not re.match(GSID_REGEX, inputname): print("ERROR: invalid format of the google sheet ID in input - {}".format(inputname)) - return inputname, 'gsheet' + return inputname, GSHEET def cabin_cross_check(connection, patchall, update, remote, lab=None, award=None): @@ -1705,7 +1660,7 @@ def main(): # pragma: no cover # need to google authenticate to allow gsheet to be read gauth = None - if booktype == 'gsheet': + if booktype == GSHEET: gauth = google_authenticate() workbook, sheetnames = get_workbook(inputname, booktype, gauth) From 5e7de2d24f2c12d69949362ab9b49de4d7099193 Mon Sep 17 00:00:00 2001 From: aschroed Date: Tue, 1 Nov 2022 18:10:15 -0400 Subject: [PATCH 09/14] updating tests and connection functions --- tests/conftest.py | 47 ++++++++++++++++ tests/test_get_field_info.py | 96 +++++++++++++++++++++++++++++---- tests/test_import_data.py | 5 +- wranglertools/get_field_info.py | 93 ++++++++++++++++++++++---------- wranglertools/import_data.py | 2 +- 5 files changed, 202 insertions(+), 41 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 90ec8dce..771b88f4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ # flake8: noqa import pytest from wranglertools.get_field_info import FDN_Key, FDN_Connection +from wranglertools.constants import CONFDIR, DEFAULT_KEYPAIR_FILE from pathlib import Path import openpyxl @@ -86,6 +87,40 @@ def worksheet(self, title): raise Exception +class MockedNamespace(object): + def __init__(self, dic): + for k, v in dic.items(): + setattr(self, k, v) + + +@pytest.fixture +def mocked_args_w_type(): + return MockedNamespace( + { + 'type': ['FileFastq', 'all'], + 'key': 'default' + } + ) + + +@pytest.fixture +def mocked_gfi_args_default(): + return MockedNamespace( + { + 'type': ['all'], + 'key': 'default', + 'keyfile': CONFDIR / DEFAULT_KEYPAIR_FILE, + 'debug': False, + 'nodesc': False, + 'comments': False, + 'noenums': False, + 'outfile': 'fields.xlsx', + 'noadmin': False + } + ) + + + @pytest.fixture def connection_mock(): keypairs = { @@ -506,6 +541,18 @@ def returned_experiment_set_schema(): data = {"title":"Experiment set","description":"Schema for submitting metadata for an experiment set.","id":"/profiles/experiment_set.json","$schema":"http://json-schema.org/draft-04/schema#","type":"object","required":["award","lab"],"identifyingProperties":["uuid","aliases"],"additionalProperties":False,"mixinProperties":[{"$ref":"mixins.json#/schema_version"},{"$ref":"mixins.json#/accession"},{"$ref":"mixins.json#/uuid"},{"$ref":"mixins.json#/aliases"},{"$ref":"mixins.json#/status"},{"$ref":"mixins.json#/attribution"},{"$ref":"mixins.json#/submitted"},{"$ref":"mixins.json#/notes"},{"$ref":"mixins.json#/documents"}],"properties":{"documents":{"type":"array","title":"Documents","items":{"type":"string","linkTo":"Document","comment":"See document.json for available identifiers.","title":"Document","description":"A document that provides additional information (not data file)."},"default":[],"description":"Documents that provide additional information (not data 
file).","uniqueItems":True},"notes":{"type":"string","elasticsearch_mapping_index_type":{"type":"string","default":"analyzed","enum":["analyzed","not_analyzed","no"],"title":"Field mapping index type","description":"Defines one of three types of indexing available"},"title":"Notes","description":"DCIC internal notes."},"submitted_by":{"type":"string","linkTo":"User","comment":"Do not submit, value is assigned by the server. The user that created the object.","title":"Submitted by","rdfs:subPropertyOf":"dc:creator","readonly":True,"serverDefault":"userid","permission":"import_items"},"date_created":{"type":"string","serverDefault":"now","anyOf":[{"format":"date-time"},{"format":"date"}],"comment":"Do not submit, value is assigned by the server. The date the object is created.","title":"Date created","rdfs:subPropertyOf":"dc:created","readonly":True,"permission":"import_items"},"lab":{"type":"string","linkTo":"Lab","comment":"See lab.json for list of available identifiers.","title":"Lab","description":"Lab associated with the submission.","linkSubmitsFor":True},"award":{"type":"string","linkTo":"Award","comment":"See award.json for list of available identifiers.","title":"Grant","description":"Grant associated with the submission."},"status":{"type":"string","readonly":True,"title":"Status","enum":["released","current","revoked","deleted","replaced","in review by lab","in review by project","released to project"],"default":"in review by lab","permission":"import_items"},"aliases":{"type":"array","title":"Lab aliases","items":{"type":"string","pattern":"^\\S+:\\S+","comment":"Current convention is colon separated lab name and lab identifier. (e.g. john-doe:42).","title":"Lab alias","description":"A lab specific identifier to reference an object.","uniqueKey":"alias"},"default":[],"description":"Lab specific identifiers to reference an object.","uniqueItems":True},"uuid":{"type":"string","readonly":True,"title":"UUID","serverDefault":"uuid4","requestMethod":"POST","permission":"import_items","format":"uuid"},"accession":{"type":"string","accessionType":"ES","readonly":True,"title":"Accession","description":"A unique identifier to be used to reference the object.","serverDefault":"accession","permission":"import_items","comment":"Only admins are allowed to set or update this value.","format":"accession"},"alternate_accessions":{"type":"array","default":[],"description":"Accessions previously assigned to objects that have been merged with this object.","title":"Alternate accessions","items":{"type":"string","comment":"Only admins are allowed to set or update this value.","title":"Alternate Accession","description":"An accession previously assigned to an object that has been merged with this object.","permission":"import_items","format":"accession"}},"schema_version":{"type":"string","pattern":"^\\d+(\\.\\d+)*$","hidden comment":"Bump the default in the subclasses.","comment":"Do not submit, value is assigned by the server. The version of the JSON schema that the server uses to validate the object. Schema version indicates generation of schema used to save version to to enable upgrade steps to work. 
Individual schemas should set the default.","title":"Schema Version","requestMethod":[]},"experiments_in_set":{"type":"array","title":"Set of experiments","exclude_from":["submit4dn"],"default":[],"description":"List of experiments to be associatedas a set.","uniqueItems":True,"items":{"title":"Experiment","comment":"use accessions for identifiers.","type":"string","linkTo":"Experiment"}},"experimentset_type":{"type":"string","enum":["custom"],"title":"Experiment Set type","description":"The categorization of the set of experiments."},"description":{"type":"string","default":"","title":"Description","description":"A description of why experiments are part of the set."},"@type":{"type":"array","calculatedProperty":True,"title":"Type","items":{"type":"string"}},"@id":{"type":"string","calculatedProperty":True,"title":"ID"}},"facets":{"experimentset_type":{"title":"Experiment set type"},"experiments_in_set.award.project":{"title":"Project"},"experiments_in_set.biosample.biosource.individual.organism.name":{"title":"Organism"},"experiments_in_set.biosample.biosource.biosource_type":{"title":"Biosource type"},"experiments_in_set.biosample.biosource_summary":{"title":"Biosource"},"experiments_in_set.digestion_enzyme.name":{"title":"Enzyme"},"experiments_in_set.biosample.modifications_summary":{"title":"Modifications"},"experiments_in_set.biosample.treatments_summary":{"title":"Treatments"},"experiments_in_set.lab.title":{"title":"Lab"}},"columns":{"accession":{"title":"Accession"},"experimentset_type":{"title":"Experiment set type"},"description":{"title":"Description"},"experiments_in_set":{"title":"Experiments"}},"@type":["JSONSchema"]} return MockedResponse(data, 200) +@pytest.fixture +def returned_experiment_hi_c_schema(): + data = {"title": "Hi-C Experiment","description": "Genome-wide chromosome conformation capture experiments including Hi-C, micro-C, DNase Hi-C","id": "/profiles/experiment_hi_c.json","$schema": "http://json-schema.org/draft-04/schema#","type": "object","required": ["experiment_type", "award", "lab", "biosample"],"identifyingProperties": ["uuid", "accession", "aliases"],"additionalProperties": False,"mixinProperties": [{ "$ref": "mixins.json#/schema_version" },{ "$ref": "mixins.json#/accession" },{ "$ref": "mixins.json#/uuid" },{ "$ref": "mixins.json#/aliases" },{ "$ref": "mixins.json#/attribution" },{ "$ref": "mixins.json#/submitted" },{ "$ref": "mixins.json#/modified" },{ "$ref": "mixins.json#/release_dates" },{ "$ref": "mixins.json#/notes" },{ "$ref": "mixins.json#/references" },{ "$ref": "mixins.json#/dbxrefs" },{ "$ref": "mixins.json#/external_submission" },{ "$ref": "mixins.json#/documents" },{ "$ref": "mixins.json#/library"},{ "$ref": "mixins.json#/sop_mapping"},{ "$ref": "mixins.json#/tags" },{ "$ref": "mixins.json#/badges" },{ "$ref": "mixins.json#/supplementary_files" },{ "$ref": "mixins.json#/static_embeds" },{ "$ref": "experiment.json#/properties"}],"mixinFacets": [{ "$ref": "experiment.json#/facets"},{ "$ref": "mixins.json#/facets_common" },{ "$ref": "mixins.json#/facets_aggregated_badges"}],"mixinColumns": [{ "$ref": "experiment.json#/columns"}],"dependencies": {"crosslinking_temperature": ["crosslinking_method", "crosslinking_time"],"crosslinking_time": ["crosslinking_method", "crosslinking_temperature"],"digestion_temperature": ["digestion_enzyme", "digestion_time"],"digestion_time": ["digestion_enzyme", "digestion_temperature"]},"properties": {"schema_version": {"default": "2"},"experiment_type": {"title": "Experiment Type","type": "string","lookup": 
10,"description": "A controlled term specifying the type of experiment.","linkTo": "ExperimentType","ff_flag": "filter:valid_item_types"},"crosslinking_method": {"title": "Crosslinking Method","description": "Term used for the method for crosslinking chromatin","type": "string","lookup": 100,"suggested_enum": ["none","1% Formaldehyde","1.3% Formaldehyde","2% Formaldehyde","2.5% Formaldehyde","3% Formaldehyde","3.5% Formaldehyde","1% Formaldehyde and 3mM DSG","1% Formaldehyde and 2mM EGS","1% Formaldehyde and 3mM EGS","2% Formaldehyde and 2mM EGS"]},"crosslinking_time": {"title": "Crosslinking Time (min)","description": "Time of crosslinking step in minutes","type": "number","lookup": 101},"crosslinking_temperature": {"title": "Crosslinking Temperature (°C)","description": "Temperature of crosslinking step in degrees Celsius","type": "number","lookup": 102},"digestion_enzyme": {"title": "Digestion Enzyme","description": "The enzyme used for digestion of the DNA.","comment": "See Enzymes sheet or collection for existing items.","type": "string","lookup": 110,"linkTo": "Enzyme"},"enzyme_lot_number": {"title": "Digestion Enzyme Lot Number","description": "Lot number of batch of enzyme used to digest DNA","type": "string","lookup": 111},"digestion_time": {"title": "Digestion Time (min)","description": "Time of digestion step in minutes","type": "number","lookup": 112},"digestion_temperature": {"title": "Digestion Temperature (°C)","description": "Temperature of digestion step in degrees Celsius","type": "number","lookup": 113},"tagging_method": {"title": "Tagging Method","description": "Information on the biotinylated base used or other tagging info","type": "string","lookup": 120,"internal_comment": "should this be a controlled CV?"},"ligation_time": {"title": "Ligation Time (min)","description": "Time of ligation step in minutes","type": "number","lookup": 130},"ligation_temperature": {"title": "Ligation Temperature (°C)","description": "Temperature of ligation step in degrees Celsius","type": "number","lookup": 131},"ligation_volume": {"title": "Ligation Volume (ml)","description": "Volume of ligation step in milliliters","type": "number","lookup": 132},"biotin_removed": {"title": "Biotin Removal Step","description": "The optional biotin removal step was performed","type": "string","lookup": 140,"enum": ["Yes", "No"]}},"columns": {}} + return MockedResponse(data, 200) + + +@pytest.fixture +def returned_file_fastq_schema(): + data = {"title": "FASTQ file","description": "Raw DNA sequncing file details and file in fastq.gz format.","id": "/profiles/file_fastq.json","$schema": "http://json-schema.org/draft-04/schema#","type": "object","required": ["file_format", "award", "lab"],"identifyingProperties": ["uuid", "accession", "aliases"],"additionalProperties": False,"mixinProperties": [{ "$ref": "mixins.json#/schema_version" },{ "$ref": "mixins.json#/uuid" },{ "$ref": "mixins.json#/submitted" },{ "$ref": "mixins.json#/modified" },{ "$ref": "mixins.json#/release_dates" },{ "$ref": "mixins.json#/aliases" },{ "$ref": "mixins.json#/attribution" },{ "$ref": "mixins.json#/notes" },{ "$ref": "mixins.json#/accession" },{ "$ref": "mixins.json#/dbxrefs" },{ "$ref": "mixins.json#/external_submission" },{ "$ref": "mixins.json#/tags" },{ "$ref": "mixins.json#/badges" },{ "$ref": "mixins.json#/static_embeds" },{ "$ref": "file.json#/properties" }],"mixinFacets": [{ "$ref": "file.json#/facets"},{ "$ref": "mixins.json#/facets_common" },{ "$ref": "mixins.json#/facets_aggregated_badges"}],"mixinColumns": [{ "$ref": 
"file.json#/columns"}],"properties": {"schema_version": {"default": "2"},"file_format": {"title": "File Format","type": "string","linkTo": "FileFormat","lookup": 20,"ff_flag": "filter:valid_item_types"},"file_type": {"title": "File Type","description": "The type of file based on the information in the file.","default": "reads","exclude_from": ["submit4dn", "FFedit-create"],"enum": ["reads","genomic reads","iPCR reads","cDNA reads","squiggles","barcode reads","index reads"]},"file_classification": {"title": "General Classification","type": "string","default": "raw file","exclude_from": ["submit4dn", "FFedit-create"],"enum": ["raw file"]},"extra_files": {"title": "Extra Files","description": "Links to extra files on s3 that don't have associated metadata","type": "array","exclude_from": ["FFedit-create"],"items": {"title": "Extra File","type": "object","required": ["file_format"],"additionalProperties": True,"properties": {"file_format": {"title": "File Format","type": "string","linkTo": "FileFormat","lookup": 400},"href": {"title": "Download URL","type": "string","exclude_from": ["submit4dn", "FFedit-create"]},"md5sum": {"title": "MD5sum","description": "The md5sum of the extra file.","type": "string","exclude_from": ["submit4dn", "FFedit-create"],"ff_flag":"clear edit","format": "hex"},"file_size": {"title": "File Size","exclude_from": ["submit4dn", "FFedit-create"],"description": "Size of file of the extra file.","comment": "","type": "integer"},"status": {"title": "Status","type": "string","exclude_from": ["submit4dn"],"default": "uploading","enum" : ["uploading","uploaded","upload failed","deleted","replaced","revoked","archived","pre-release","released","released to project","archived to project","to be uploaded by workflow"]},"use_for": {"title": "Use for","description": "The use of the extra file.","type": "string","enum": ["visualization"]}}}},"read_length": {"title": "Sequencing Read Length (bp)","description": "Length of sequencing reads in base pairs for fastq files","type": "integer","lookup": 40},"instrument": {"title": "Sequencer","description": "Instrument used for sequencing","type": "string","lookup": 50,"internal_comment": "should this be a controlled CV with enum, or another object?"},"paired_end": {"title": "Paired End Identifier","description": "Which pair the file belongs to (if paired end library)","type": "string","lookup": 30,"enum": ["1","2"]},"flowcell_details": {"title": "Flowcells","description": "For high-throughput sequencing, the flowcells used for the sequencing of the replicate.","type": "array","items": {"title": "Flowcell details","type": "object","required": ["machine"],"additionalProperties": True,"properties": {"machine": {"title": "Machine Name","description": "The lab specific name of the machine used.","type": "string","lookup": 61},"flowcell": {"title": "Flowcell ID","type": "string","lookup": 62},"lane": {"title": "Lane","type": "string","lookup": 63},"barcode": {"title": "Barcode","type": "string","lookup": 64},"barcode_in_read": {"title": "Barcode in Read","description": "The read the barcode is located on.","type": "string","lookup": 65,"enum": ["1","2"]},"barcode_position": {"title": "Barcode Position","description": "The 1-based start position of the barcode in 5->3 orientation.","type": "integer","lookup": 66},"chunk": {"title": "Chunk","description": "The file chunk label as assigned by Illumina software when splitting up a fastq into specified chunk sizes.","comment": "This label is used to re-assemble the chunks into the original file in 
the correct order.","type": "string","lookup": 67}}}},"beta_actin_sense_count":{"title": "Beta-actin count in the sense strand","description": "Number of reads that match a 21kmer of the Beta-actin encoding gene in the sense strand (RNA-seq experiments)","type": "integer","exclude_from": ["submit4dn", "FFedit-create"],"permission": "import_items","lookup": 1000},"beta_actin_antisense_count":{"title": "Beta-actin count in the anti-sense strand","description": "Number of reads that match a 21kmer of the Beta-actin encoding gene in the anti-sense strand (RNA-seq experiments)","type": "integer","exclude_from": ["submit4dn", "FFedit-create"],"permission": "import_items","lookup": 1001},"file_first_line":{"title": "First line of the fastq file","description": "First line of the fastq file","type": "string","exclude_from": ["submit4dn", "FFedit-create"],"permission": "import_items","lookup": 1002}}} + return MockedResponse(data, 200) + + @pytest.fixture def returned_vendor_items(): data = {'@id': '/search/?type=Vendor&limit=all&frame=object', 'sort': {'label': {'order': 'asc', 'missing': '_last', 'ignore_unmapped': True}, 'date_created': {'order': 'desc', 'ignore_unmapped': True}}, 'columns': {'@id': 'ID', 'aliases': 'Lab aliases', 'name': 'name', 'description': 'Description', 'title': 'Name'}, 'clear_filters': '/search/?type=Vendor', '@context': '/terms/', 'views': [{'href': '/report/?type=Vendor&limit=all&frame=object', 'title': 'View tabular report', 'icon': 'table'}], 'notification': 'Success', 'filters': [{'field': 'type', 'term': 'Vendor', 'remove': '/search/?limit=all&frame=object'}], '@type': ['Search'], '@graph': [{'url': 'https://www.thermofisher.com/us/en/home/brands/thermo-scientific.html#/legacy=www.fermentas.com', '@id': '/vendors/thermofisher-scientific/', 'aliases': [], 'status': 'in review by lab', 'description': 'previously also Fermentas', 'award': '/awards/1U01CA200059-01/', 'uuid': 'b31106bc-8535-4448-903e-854af460b21f', 'lab': '/labs/4dn-dcic-lab/', 'date_created': '2016-12-08T18:31:47.847660+00:00', '@type': ['Vendor', 'Item'], 'schema_version': '1', 'title': 'ThermoFisher Scientific', 'name': 'thermofisher-scientific', 'submitted_by': '/users/986b362f-4eb6-4a9c-8173-3ab267307e3a/'}, {'url': 'https://www.neb.com', '@id': '/vendors/new-england-biolabs/', 'aliases': [], 'status': 'in review by lab', 'description': '', 'award': '/awards/1U01CA200059-01/', 'uuid': 'b31106bc-8535-4448-903e-854af460b21e', 'lab': '/labs/4dn-dcic-lab/', 'date_created': '2016-12-08T18:31:47.824418+00:00', '@type': ['Vendor', 'Item'], 'schema_version': '1', 'title': 'New England Biolabs', 'name': 'new-england-biolabs', 'submitted_by': '/users/986b362f-4eb6-4a9c-8173-3ab267307e3a/'}, {'url': 'http://www.worthington-biochem.com', '@id': '/vendors/worthington-biochemical/', 'aliases': [], 'status': 'in review by lab', 'description': '', 'award': '/awards/1U01CA200059-01/', 'uuid': 'b31106bc-8535-4448-903e-854af460b21d', 'lab': '/labs/4dn-dcic-lab/', 'date_created': '2016-12-08T18:31:47.807726+00:00', '@type': ['Vendor', 'Item'], 'schema_version': '1', 'title': 'Worthington Biochemical', 'name': 'worthington-biochemical', 'submitted_by': '/users/986b362f-4eb6-4a9c-8173-3ab267307e3a/'}], 'title': 'Search', 'total': 3, 'facets': [{'total': 3, 'title': 'Data Type', 'field': 'type', 'terms': [{'key': 'Vendor', 'doc_count': 3}, {'key': 'AccessKey', 'doc_count': 0}, {'key': 'AnalysisStep', 'doc_count': 0}, {'key': 'Award', 'doc_count': 0}, {'key': 'Biosample', 'doc_count': 0}, {'key': 'BiosampleCellCulture', 
'doc_count': 0}, {'key': 'Biosource', 'doc_count': 0}, {'key': 'Construct', 'doc_count': 0}, {'key': 'Document', 'doc_count': 0}, {'key': 'Enzyme', 'doc_count': 0}, {'key': 'Experiment', 'doc_count': 0}, {'key': 'ExperimentCaptureC', 'doc_count': 0}, {'key': 'ExperimentHiC', 'doc_count': 0}, {'key': 'ExperimentRepliseq', 'doc_count': 0}, {'key': 'File', 'doc_count': 0}, {'key': 'FileFasta', 'doc_count': 0}, {'key': 'FileFastq', 'doc_count': 0}, {'key': 'FileProcessed', 'doc_count': 0}, {'key': 'FileReference', 'doc_count': 0}, {'key': 'FileSet', 'doc_count': 0}, {'key': 'Individual', 'doc_count': 0}, {'key': 'IndividualMouse', 'doc_count': 0}, {'key': 'Lab', 'doc_count': 0}, {'key': 'Modification', 'doc_count': 0}, {'key': 'Ontology', 'doc_count': 0}, {'key': 'OntologyTerm', 'doc_count': 0}, {'key': 'Organism', 'doc_count': 0}, {'key': 'Publication', 'doc_count': 0}, {'key': 'Software', 'doc_count': 0}, {'key': 'SopMap', 'doc_count': 0}, {'key': 'BioFeature', 'doc_count': 0}, {'key': 'Treatment', 'doc_count': 0}, {'key': 'TreatmentChemical', 'doc_count': 0}, {'key': 'TreatmentRnai', 'doc_count': 0}, {'key': 'User', 'doc_count': 0}, {'key': 'Workflow', 'doc_count': 0}, {'key': 'WorkflowRun', 'doc_count': 0}]}, {'total': 3, 'title': 'Audit category: DCC ACTION', 'field': 'audit.INTERNAL_ACTION.category', 'terms': [{'key': 'mismatched status', 'doc_count': 0}, {'key': 'validation error', 'doc_count': 0}, {'key': 'validation error: run_status', 'doc_count': 0}]}]} diff --git a/tests/test_get_field_info.py b/tests/test_get_field_info.py index e154ef6a..2c121f1e 100644 --- a/tests/test_get_field_info.py +++ b/tests/test_get_field_info.py @@ -1,12 +1,30 @@ import wranglertools.get_field_info as gfi +from wranglertools.constants import CONFDIR, DEFAULT_KEYPAIR_FILE import pytest -from operator import itemgetter import openpyxl from pathlib import Path import os # test data is in conftest.py + +def test_gfi_get_args_required_default(): + defaults = { + 'type': ['all'], + 'key': 'default', + 'keyfile': CONFDIR / DEFAULT_KEYPAIR_FILE, + 'debug': False, + 'nodesc': False, + 'comments': False, + 'noenums': False, + 'outfile': 'fields.xlsx', + 'noadmin': False + } + args = gfi.getArgs([]) + for k, v in defaults.items(): + assert getattr(args, k) == v + + keypairs = { "default": { @@ -24,7 +42,7 @@ def mkey(): def test_key(): key = gfi.FDN_Key(keypairs, "default") - assert(key) + assert key assert isinstance(key.con_key["server"], str) assert isinstance(key.con_key['key'], str) assert isinstance(key.con_key['secret'], str) @@ -33,16 +51,24 @@ def test_key(): @pytest.mark.file_operation def test_key_file(): key = gfi.FDN_Key('./tests/data_files/keypairs.json', "default") - assert(key) + assert key assert isinstance(key.con_key["server"], str) assert isinstance(key.con_key['key'], str) assert isinstance(key.con_key['secret'], str) +def test_key_from_env(mocker): + #mocker.patch('wranglertools.get_field_info.os.environ.get', return_value='mock/dir') + #mocker.patch.object(Path, 'is_file') + #import pdb; pdb.set_trace() + key = gfi.FDN_Key('keypairs.json', 'default') + print(key) + + def test_key_error_wrong_format(capsys): gfi.FDN_Key([("key_name", "my_key")], "key_name") out = capsys.readouterr()[0] - message = "The keyfile does not exist, check the --keyfile path or add 'keypairs.json' to your home folder" + message =f"The keyfile [('key_name', 'my_key')] does not exist\ncheck the --keyfile path or add {DEFAULT_KEYPAIR_FILE} to {CONFDIR}" assert out.strip() == message @@ -192,6 +218,20 @@ def 
test_connection_prompt_for_lab_award_multi_lab_award( assert connection.award == chosenaward +def test_remove_all_from_types_multitypes(mocked_args_w_type): + assert 'all' in mocked_args_w_type.type + gfi._remove_all_from_types(mocked_args_w_type) + assert 'all' not in mocked_args_w_type.type + assert 'FileFastq' in mocked_args_w_type.type + + +def test_remove_all_from_types_do_not_rm_when_only_type(mocked_args_w_type): + assert 'all' in mocked_args_w_type.type + gfi._remove_all_from_types(mocked_args_w_type) + assert 'all' not in mocked_args_w_type.type + assert 'FileFastq' in mocked_args_w_type.type + + def test_set_award_no_lab(mocker, mkey, returned_user_me_submit_for_one_lab, returned_lab_w_one_award): mocker.patch('dcicutils.ff_utils.get_metadata', side_effect=[ @@ -344,6 +384,28 @@ def test_get_uploadable_fields_mock(connection_mock, mocker, returned_vendor_sch assert field.enum is not None +def test_get_uploadable_fields_experiment_added_fields(connection_mock, mocker, returned_experiment_hi_c_schema): + added_field_names = ['*replicate_set', '*bio_rep_no', '*tec_rep_no'] + mocker.patch('dcicutils.ff_utils.get_metadata', return_value=returned_experiment_hi_c_schema.json()) + mocker.patch('dcicutils.ff_utils.search_metadata', return_value=[ + {"title": "single cell Methyl Hi-C"}, {"title": "Methyl Hi-C"}, {"title": "Dilution Hi-C"}, + {"title": "DNase Hi-C"}, {"title": "Micro-C"}, {"title": "single cell Hi-C"}, {"title": "sci-Hi-C"}, + {"title": "TCC"}, {"title": "in situ Hi-C"}, {"title": "MC-Hi-C"}, {"title": "MC-3C"}, {"title": "sn-Hi-C"}]) + field_dict = gfi.get_uploadable_fields(connection_mock, ['ExperimentHiC']) + field_list = field_dict['ExperimentHiC'] + assert len([field.name for field in field_list if field.name in added_field_names]) == len(added_field_names) + + +def test_get_uploadable_fields_file_extra_files(connection_mock, mocker, returned_file_fastq_schema): + added_field_names = ['extra_files.file_format', 'extra_files.use_for'] + mocker.patch('dcicutils.ff_utils.get_metadata', return_value=returned_file_fastq_schema.json()) + mocker.patch('dcicutils.ff_utils.search_metadata', return_value=[{'file_format': 'fastq'}, {'file_format': 'tar'}]) + field_dict = gfi.get_uploadable_fields(connection_mock, ['FileFastq']) + field_list = field_dict['FileFastq'] + assert len([field.name for field in field_list if field.name in added_field_names]) == len(added_field_names) + assert all([field.ftype.startswith('array of embedded') for field in field_list if field.name in added_field_names]) + + def xls_to_list(xls_file, sheet): """To compare xls files to reference ones, return a sorted list of content.""" wb = openpyxl.load_workbook(xls_file) @@ -367,12 +429,16 @@ def test_create_xlsx_default_options(connection_mock, mocker, returned_bcc_schem '#Additional Info:', '#Description:', '#Field Name:', '#Field Type:', '*culture_start_date', '-', '-', '-', '-', '-', '-', 'A short description of the cell culture procedure - eg. 
Details on culturing a preparation of K562 cells', - "Choices:['Yes', 'No']", "Choices:['cardiac muscle myoblast', 'cardiac muscle cell']", "Choices:['non synchronized', 'G1']", + "Choices:['Yes', 'No']", "Choices:['cardiac muscle myoblast', 'cardiac muscle cell']", + "Choices:['non synchronized', 'G1']", 'If a culture is synchronized the cell cycle stage from which the biosample used in an experiment is prepared', - 'Item:OntologyTerm', 'Protocols including additional culture manipulations such as stem cell differentiation or cell cycle synchronization.', + 'Item:OntologyTerm', + 'Protocols including additional culture manipulations such as stem cell differentiation or' + ' cell cycle synchronization.', 'Relevant for pluripotent and stem cell lines - set to Yes if cells have undergone in vitro differentiation', 'The resulting tissue or cell type for cells that have undergone differentiation.', - 'Total number of culturing days since receiving original vial', 'YYYY-MM-DD format date for most recently thawed cell culture.', + 'Total number of culturing days since receiving original vial', + 'YYYY-MM-DD format date for most recently thawed cell culture.', 'array of Item:Protocol', 'culture_duration', 'culture_harvest_date', 'description', 'in_vitro_differentiated', 'integer', 'number', 'passage_number', 'protocols_additional', 'string', 'string', 'string', 'string', 'string', 'synchronization_stage', 'tissue' @@ -411,7 +477,8 @@ def test_create_xlsx_non_defaults(connection_mock, mocker, returned_bcc_schema): except OSError: pass mocker.patch('dcicutils.ff_utils.get_metadata', return_value=returned_bcc_schema.json()) - field_dict = gfi.get_uploadable_fields(connection_mock, ['BiosampleCellCulture'], no_description=True, include_comments=True, no_enums=True) + field_dict = gfi.get_uploadable_fields(connection_mock, ['BiosampleCellCulture'], no_description=True, + include_comments=True, no_enums=True) gfi.create_excel(field_dict, xls_file) assert os.path.isfile(xls_file) assert xls_to_list(xls_file, "BiosampleCellCulture") == EXPECTED @@ -445,10 +512,21 @@ def test_create_xls_lookup_order(connection_mock, mocker, returned_vendor_schema def test_get_sheet_names(capfd): input_list = ['hic', 'experi-ment_capture-c', 'TreatmentChemical', 'Biosample'] result = gfi.get_sheet_names(input_list) - out, err = capfd.readouterr() + out, _ = capfd.readouterr() assert result == [ 'Protocol', 'Publication', 'Image', 'Biosource', 'BiosampleCellCulture', 'Biosample', 'FileFastq', 'ExperimentHiC', 'ExperimentCaptureC', 'ExperimentSetReplicate' ] assert len(result) == len(list(set(result))) assert 'No schema found for type TreatmentChemical' in out + + +def test_get_sheet_names_all(): + from wranglertools.constants import SHEET_ORDER + sheet_names = SHEET_ORDER[:] + sheet_names.remove('ExperimentMic_Path') + sheet_names.remove('OntologyTerm') + count = len(sheet_names) + res = gfi.get_sheet_names(['All']) + assert len(res) == count + assert res == sheet_names diff --git a/tests/test_import_data.py b/tests/test_import_data.py index abb7538e..1c18638d 100644 --- a/tests/test_import_data.py +++ b/tests/test_import_data.py @@ -1,6 +1,5 @@ import pathlib as pp from plistlib import InvalidFileException -from gspread.exceptions import GSpreadException import pytest import inspect @@ -432,7 +431,7 @@ def test_get_workbook_gsheet(mocker): def test_get_workbook_gsheet_fail_w_no_auth(): filename = 'http://docs.google.com/test_sheet' - with pytest.raises(GSpreadException): + with pytest.raises(Exception): 
imp.get_workbook(filename, 'gsheet') @@ -696,7 +695,7 @@ def test_order_sorter(capsys): ordered_list = ['User', 'Lab', 'Document', 'IndividualMouse', 'Modification', 'Biosource', 'BiosampleCellCulture', 'Biosample', 'ExperimentHiC'] message0 = "WARNING! Trouble sheet(s) are not loaded" - message1 = '''WARNING! Check the sheet names and the reference list "sheet_order"''' + message1 = '''WARNING! Check the sheet names and the list in constant "SHEET_ORDER"''' assert ordered_list == imp.order_sorter(test_list) out = capsys.readouterr()[0] outlist = [i.strip() for i in out.split('\n') if i] diff --git a/wranglertools/get_field_info.py b/wranglertools/get_field_info.py index 34bc5782..ddfeba6d 100755 --- a/wranglertools/get_field_info.py +++ b/wranglertools/get_field_info.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # -*- coding: latin-1 -*- +from logging.config import DEFAULT_LOGGING_CONFIG_PORT import pathlib as pp import argparse from dcicutils import ff_utils @@ -69,7 +70,7 @@ def create_common_arg_parser(): return parser -def getArgs(): # pragma: no cover +def getArgs(args): # pragma: no cover parser = argparse.ArgumentParser( parents=[create_common_arg_parser()], description=__doc__, epilog=EPILOG, @@ -94,44 +95,77 @@ def getArgs(): # pragma: no cover default=False, action='store_true', help="Will set an admin user to non-admin for generating sheets") - args = parser.parse_args() + args = parser.parse_args(args) _remove_all_from_types(args) return args class FDN_Key: def __init__(self, keyfile, keyname): + ''' If key info is not provided as a dictionary then + We want to try to get the key info in the following order + 1) if a keyfile param is provided (not the default value) + then look there for the keys first + - using keyname if provided or + - look for a "default" entry if not + 2) check to see if the ENV_VAR_DIR env variable is set and in this + case we expect to find a file named keypairs.json with keys + to use there where we will look for key named keyname or "default" + 3) finally look for keypairs.json in the .submit4dn directory and in home + directory + ''' self.error = False keys = None - envdir = os.environ.get(ENV_VAR_DIR) - # is the keyfile a dictionary - if isinstance(keyfile, dict): + default_location = str(CONFDIR.joinpath(DEFAULT_KEYPAIR_FILE)) + if not keyfile: # this should not happen as defaults are supplied in gfi or imp but just in case + msg = "keyfile parameter missing" + self.set_error(msg) + return + elif isinstance(keyfile, dict): # is the keyfile a dictionary keys = keyfile - else: + elif str(keyfile) != default_location: + fpath = pp.Path(str(keyfile)) + if not fpath.is_file(): + msg = f"\nThe keyfile {keyfile} does not exist\ncheck the --keyfile path or add {DEFAULT_KEYPAIR_FILE} to {CONFDIR}\n" + self.set_error(msg) + return + else: # default keyfile arg has been passed + envdir = os.environ.get(ENV_VAR_DIR) if envdir: # loc of keypairs.json specified in env var - keyfile = envdir - else: # check if file - fpath = pp.Path(str(keyfile)) + fpath = pp.Path(envdir).joinpath(DEFAULT_KEYPAIR_FILE) + if not fpath.is_file(): + msg = f"\n{envdir} directory set as an env variable does not contain {DEFAULT_KEYPAIR_FILE}\n" + self.set_error(msg) + return + else: + # see if file found in default location + fpath = pp.Path(keyfile) if not fpath.is_file(): # maybe it's stored in the old default home dir fpath = HOME.joinpath(DEFAULT_KEYPAIR_FILE) if not fpath.is_file(): - print("\nThe keyfile does not exist\n" - f"check the --keyfile path or add 'keypairs.json' to 
{CONFDIR}\n") - self.error = True + msg = f"\nThe keyfile does not exist! Add {DEFAULT_KEYPAIR_FILE} to {CONFDIR} or use the --keyfile option\n" + self.set_error(msg) return + if not keys and fpath: with open(fpath, 'r') as keys_f: keys_json_string = keys_f.read() - keys = json.loads(keys_json_string) + keys = json.loads(keys_json_string) + try: self.con_key = keys[keyname] except KeyError: - print(f"ERROR: No key with {keyname} found - check your keypairs file") - sys.exit(1) + msg = f"ERROR: No key with {keyname} found - check your keypairs file" + self.set_error(msg) + return if not self.con_key['server'].endswith("/"): self.con_key['server'] += "/" + def set_error(self, msg): + print(msg) + self.error = True + class FDN_Connection(object): def __init__(self, key4dn): @@ -236,16 +270,9 @@ class FieldInfo(object): enum = attr.ib(default=u'') -# additional fields for experiment sheets to capture experiment_set related information -exp_set_addition = [FieldInfo('*replicate_set', 'Item:ExperimentSetReplicate', 3, 'Grouping for replicate experiments'), - FieldInfo('*bio_rep_no', 'integer', 4, 'Biological replicate number'), - FieldInfo('*tec_rep_no', 'integer', 5, 'Technical replicate number'), - ] - - -file_types = [i for i in SHEET_ORDER if i.startswith('File') and not i.startswith('FileSet')] -file_types.remove('FileFormat') -exp_types = [i for i in SHEET_ORDER if i.startswith('Experiment') and 'Type' not in i and 'Set' not in i] +# file_types = [i for i in SHEET_ORDER if i.startswith('File') and not (i.startswith('FileSet') or i == 'FileFormat')] +# file_types.remove('FileFormat') +# exp_types = [i for i in SHEET_ORDER if i.startswith('Experiment') and 'Type' not in i and 'Set' not in i] def get_field_type(field): @@ -338,23 +365,33 @@ def build_field_list(properties, required_fields=None, no_description=False, class FDN_Schema(object): + file_types = [i for i in SHEET_ORDER if i.startswith('File') and not (i.startswith('FileSet') or i == 'FileFormat')] + exp_types = [i for i in SHEET_ORDER if i.startswith('Experiment') and 'Type' not in i and 'Set' not in i] + def __init__(self, connection, schema_name): uri = '/profiles/' + schema_name + '.json' response = ff_utils.get_metadata(uri, key=connection.key, add_on="frame=object") self.required = None if 'required' in response: self.required = response['required'] - if schema_name in file_types and response['properties'].get('file_format'): + if schema_name in FDN_Schema.file_types and response['properties'].get('file_format'): q = '/search/?type=FileFormat&field=file_format&valid_item_types={}'.format(schema_name) formats = [i['file_format'] for i in ff_utils.search_metadata(q, key=connection.key)] response['properties']['file_format']['enum'] = formats - elif schema_name in exp_types and response['properties'].get('experiment_type'): + elif schema_name in FDN_Schema.exp_types and response['properties'].get('experiment_type'): q = '/search/?type=ExperimentType&field=title&valid_item_types={}'.format(schema_name) exptypes = [i['title'] for i in ff_utils.search_metadata(q, key=connection.key)] response['properties']['experiment_type']['enum'] = exptypes self.properties = response['properties'] +# additional fields for experiment sheets to capture experiment_set related information +exp_set_addition = [FieldInfo('*replicate_set', 'Item:ExperimentSetReplicate', 3, 'Grouping for replicate experiments'), + FieldInfo('*bio_rep_no', 'integer', 4, 'Biological replicate number'), + FieldInfo('*tec_rep_no', 'integer', 5, 'Technical replicate number'), 
+ ] + + def get_uploadable_fields(connection, types, no_description=False, include_comments=False, no_enums=False): fields = {} @@ -453,7 +490,7 @@ def get_sheet_names(types_list): def main(): # pragma: no cover - args = getArgs() + args = getArgs(sys.argv[1:]) # the sys.argv bit is for testing purposes key = FDN_Key(args.keyfile, args.key) if key.error: sys.exit(1) diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py index 73a6bc29..a23a84b8 100755 --- a/wranglertools/import_data.py +++ b/wranglertools/import_data.py @@ -26,7 +26,7 @@ from gspread.exceptions import GSpreadException from openpyxl.utils.exceptions import InvalidFileException from wranglertools.constants import ( - CONFDIR, GSID_REGEX, SHEET_ORDER, LIST_OF_LOADXL_FIELDS, ENV_VAR_DIR, GCRED_FNAME, + CONFDIR, SHEET_ORDER, LIST_OF_LOADXL_FIELDS, ENV_VAR_DIR, GCRED_FNAME, AUTH_TOKEN_FNAME, SCOPES, GSHEET, EXCEL, ZIP_MIME, XLSX_MIME, ALLOWED_MIMES, GSHEET_URL_REGEX, GSID_REGEX ) From 25e3f897fc103ca771a09bfd1a32c22796bec786 Mon Sep 17 00:00:00 2001 From: aschroed Date: Wed, 2 Nov 2022 15:36:07 -0400 Subject: [PATCH 10/14] added gfi tests --- tests/test_get_field_info.py | 119 +++++++++++++++++++++++++++++--- wranglertools/get_field_info.py | 6 +- 2 files changed, 113 insertions(+), 12 deletions(-) diff --git a/tests/test_get_field_info.py b/tests/test_get_field_info.py index 2c121f1e..941183a0 100644 --- a/tests/test_get_field_info.py +++ b/tests/test_get_field_info.py @@ -40,7 +40,7 @@ def mkey(): return gfi.FDN_Key(keypairs, "default") -def test_key(): +def test_key_as_dict(): key = gfi.FDN_Key(keypairs, "default") assert key assert isinstance(key.con_key["server"], str) @@ -48,21 +48,116 @@ def test_key(): assert isinstance(key.con_key['secret'], str) +@pytest.fixture +def keydirname(): + return './tests/data_files/' + +@pytest.fixture +def keydir(keydirname): + return Path(keydirname) + +@pytest.fixture +def keyfilename(): + return 'keypairs.json' + + +@pytest.fixture +def keypath(keydir, keyfilename): + return keydir.joinpath(keyfilename) + + +@pytest.fixture +def missing_dirname(): + return './missing/keydir/' + + +@pytest.fixture +def missing_dir(missing_dirname): + return Path(missing_dirname) + +@pytest.mark.file_operation +def test_key_file(keypath): + ''' testing when an actual keyfile path is provided as per --keyfile option''' + key = gfi.FDN_Key(keypath, "default") + assert key + assert isinstance(key.con_key["server"], str) + assert isinstance(key.con_key['key'], str) + assert isinstance(key.con_key['secret'], str) + + +@pytest.mark.file_operation +def test_key_from_env(mocker, keydirname): + ''' testing getting directory where keypairs.json is stored when directory location + is set in an enviromental var - by mocking os.environ.get function + to hit this clause the expected default keypath must be passed to the constructor''' + default_keypath = CONFDIR / DEFAULT_KEYPAIR_FILE + mocker.patch('wranglertools.get_field_info.os.environ.get', return_value=keydirname) + key = gfi.FDN_Key(default_keypath, 'default') + assert key + assert isinstance(key.con_key["server"], str) + assert isinstance(key.con_key['key'], str) + assert isinstance(key.con_key['secret'], str) + + +def test_key_from_env_set_wrong(mocker, capsys): + ''' testing when directory location is set in an enviromental var and the expected 'keypairs.json' + is not found in the director - by mocking os.environ.get function + to hit this clause the expected default keypath must be passed to the constructor''' + default_keypath = CONFDIR / 
DEFAULT_KEYPAIR_FILE + baddir = 'some/other/name/' + mocker.patch('wranglertools.get_field_info.os.environ.get', return_value=baddir) + # import pdb; pdb.set_trace() + key = gfi.FDN_Key(default_keypath, 'default') + out = capsys.readouterr()[0] + assert key.error + assert out == f'\n{baddir} directory set as an env variable does not contain {DEFAULT_KEYPAIR_FILE}\n\n' + + @pytest.mark.file_operation -def test_key_file(): - key = gfi.FDN_Key('./tests/data_files/keypairs.json', "default") +def test_key_from_default_location(mocker, keydir, keydirname, keyfilename): + '''little bit wonky as we are "mocking" the default location to be where the test file is stored + by over-riding the constant''' + mocker.patch("wranglertools.get_field_info.CONFDIR", keydir) + default_keypath = keydirname + keyfilename + key = gfi.FDN_Key(default_keypath, 'default') assert key assert isinstance(key.con_key["server"], str) assert isinstance(key.con_key['key'], str) assert isinstance(key.con_key['secret'], str) -def test_key_from_env(mocker): - #mocker.patch('wranglertools.get_field_info.os.environ.get', return_value='mock/dir') - #mocker.patch.object(Path, 'is_file') - #import pdb; pdb.set_trace() - key = gfi.FDN_Key('keypairs.json', 'default') - print(key) +@pytest.mark.file_operation +def test_key_from_home_location(mocker, keydir, keydirname, keyfilename): + '''little bit wonky as we are "mocking" the default location to be where the test file is stored + by over-riding the constant''' + mocker.patch("wranglertools.get_field_info.HOME", keydir) + default_keypath = keydirname + keyfilename + key = gfi.FDN_Key(default_keypath, 'default') + assert key + assert isinstance(key.con_key["server"], str) + assert isinstance(key.con_key['key'], str) + assert isinstance(key.con_key['secret'], str) + + +def test_key_default_file_missing(mocker, capsys, missing_dir, missing_dirname, keyfilename): + ''' in this case we are mocking the default filename so it's not found''' + mocker.patch("wranglertools.get_field_info.CONFDIR", missing_dir) + mocker.patch("wranglertools.get_field_info.HOME", missing_dir) + mocker.patch('wranglertools.get_field_info.os.environ.get', return_value=None) + default_keypath = missing_dirname + keyfilename + key = gfi.FDN_Key(str(default_keypath), 'default') + out = capsys.readouterr()[0] + assert key.error + assert out == f"\nThe keyfile does not exist! 
Add keypairs.json to {missing_dir} or use the --keyfile option\n\n" + + +def test_key_no_keyfile(capsys): + ''' this is testing something that should not be possible when running get_field_info but if using FDN_Key + in another context/script this could be relevant + ''' + gfi.FDN_Key(None, 'default') + out = capsys.readouterr()[0] + assert out == "keyfile parameter missing\n" def test_key_error_wrong_format(capsys): @@ -72,6 +167,12 @@ def test_key_error_wrong_format(capsys): assert out.strip() == message +def test_key_error_bad_keyname(capsys): + key = gfi.FDN_Key(keypairs, "nosuchkey") + out = capsys.readouterr()[0] + assert key.error + assert out == "ERROR: No key with name 'nosuchkey' found - check your keypairs file\n" + def bad_connection_will_exit(): with pytest.raises(SystemExit) as excinfo: keypairs = { diff --git a/wranglertools/get_field_info.py b/wranglertools/get_field_info.py index ddfeba6d..1869de00 100755 --- a/wranglertools/get_field_info.py +++ b/wranglertools/get_field_info.py @@ -116,14 +116,14 @@ def __init__(self, keyfile, keyname): ''' self.error = False keys = None - default_location = str(CONFDIR.joinpath(DEFAULT_KEYPAIR_FILE)) + default_location = CONFDIR.joinpath(DEFAULT_KEYPAIR_FILE) if not keyfile: # this should not happen as defaults are supplied in gfi or imp but just in case msg = "keyfile parameter missing" self.set_error(msg) return elif isinstance(keyfile, dict): # is the keyfile a dictionary keys = keyfile - elif str(keyfile) != default_location: + elif pp.Path(str(keyfile)) != default_location: fpath = pp.Path(str(keyfile)) if not fpath.is_file(): msg = f"\nThe keyfile {keyfile} does not exist\ncheck the --keyfile path or add {DEFAULT_KEYPAIR_FILE} to {CONFDIR}\n" @@ -156,7 +156,7 @@ def __init__(self, keyfile, keyname): try: self.con_key = keys[keyname] except KeyError: - msg = f"ERROR: No key with {keyname} found - check your keypairs file" + msg = f"ERROR: No key with name '{keyname}' found - check your keypairs file" self.set_error(msg) return if not self.con_key['server'].endswith("/"): From b41dd749e5155876e0fbaa54f8a2c17cb42551f6 Mon Sep 17 00:00:00 2001 From: aschroed Date: Fri, 4 Nov 2022 12:00:27 -0400 Subject: [PATCH 11/14] fixing some tests --- tests/test_import_data.py | 20 +++++++++++++++++++- wranglertools/get_field_info.py | 2 +- wranglertools/import_data.py | 6 +++--- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/tests/test_import_data.py b/tests/test_import_data.py index 1c18638d..636ee976 100644 --- a/tests/test_import_data.py +++ b/tests/test_import_data.py @@ -1,6 +1,6 @@ import pathlib as pp from plistlib import InvalidFileException - +from wranglertools.constants import CONFDIR, DEFAULT_KEYPAIR_FILE import pytest import inspect import wranglertools.import_data as imp @@ -9,6 +9,24 @@ # test data is in conftest.py +def test_imp_get_args_required_default(): + defaults = { + 'type': ['all'], + 'key': 'default', + 'keyfile': CONFDIR / DEFAULT_KEYPAIR_FILE, + 'debug': False, + 'update': False, + 'patchall': False, + 'remote': False, + 'novalidate': False, + } + args = imp.getArgs(['infile']) + for k, v in defaults.items(): + assert getattr(args, k) == v + assert args.infile == 'infile' + + + def convert_to_path_with_tilde(string_path): """Somehow the inverse of pathlib.Path.expanduser(). 
Helper function used to generate valid paths containing ~ """ diff --git a/wranglertools/get_field_info.py b/wranglertools/get_field_info.py index 1869de00..c1190690 100755 --- a/wranglertools/get_field_info.py +++ b/wranglertools/get_field_info.py @@ -130,7 +130,7 @@ def __init__(self, keyfile, keyname): self.set_error(msg) return else: # default keyfile arg has been passed - envdir = os.environ.get(ENV_VAR_DIR) + envdir = os.environ.get(p ENV_VAR_DIR) if envdir: # loc of keypairs.json specified in env var fpath = pp.Path(envdir).joinpath(DEFAULT_KEYPAIR_FILE) if not fpath.is_file(): diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py index a23a84b8..a47118a3 100755 --- a/wranglertools/import_data.py +++ b/wranglertools/import_data.py @@ -35,7 +35,7 @@ create_common_arg_parser) -def getArgs(): # pragma: no cover +def getArgs(args): parser = argparse.ArgumentParser( parents=[create_common_arg_parser()], description=__doc__, epilog=EPILOG, @@ -70,7 +70,7 @@ def getArgs(): # pragma: no cover default=False, action='store_true', help="Will skip pre-validation of workbook") - args = parser.parse_args() + args = parser.parse_args(args) _remove_all_from_types(args) return args @@ -1646,7 +1646,7 @@ def get_all_aliases(workbook, sheets, booktype): def main(): # pragma: no cover - args = getArgs() + args = getArgs(sys.argv[1:]) # the sys.argv bit is for testing purposes key = FDN_Key(args.keyfile, args.key) # check if key has error if key.error: From 98ce013759f8902ae73b411f1c25a418544f5c4d Mon Sep 17 00:00:00 2001 From: aschroed Date: Sat, 5 Nov 2022 09:33:42 -0400 Subject: [PATCH 12/14] changed some constant names --- tests/test_get_field_info.py | 3 ++- wranglertools/constants.py | 2 +- wranglertools/get_field_info.py | 8 ++++---- wranglertools/import_data.py | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/test_get_field_info.py b/tests/test_get_field_info.py index 941183a0..d3df63d9 100644 --- a/tests/test_get_field_info.py +++ b/tests/test_get_field_info.py @@ -163,7 +163,8 @@ def test_key_no_keyfile(capsys): def test_key_error_wrong_format(capsys): gfi.FDN_Key([("key_name", "my_key")], "key_name") out = capsys.readouterr()[0] - message =f"The keyfile [('key_name', 'my_key')] does not exist\ncheck the --keyfile path or add {DEFAULT_KEYPAIR_FILE} to {CONFDIR}" + message = (f"The keyfile [('key_name', 'my_key')] does not exist\n" + f"check the --keyfile path or add {DEFAULT_KEYPAIR_FILE} to {CONFDIR}") assert out.strip() == message diff --git a/wranglertools/constants.py b/wranglertools/constants.py index 459f2b9e..522c6f07 100755 --- a/wranglertools/constants.py +++ b/wranglertools/constants.py @@ -10,7 +10,7 @@ HOME = pp.Path.home() CONFDIR = HOME.joinpath('.submit4dn') DEFAULT_KEYPAIR_FILE = 'keypairs.json' -ENV_VAR_DIR = 'SUBMIT_4DN_CONF_DIR' +CONFDIR_ENVVAR = 'SUBMIT_4DN_CONFDIR' SHEET_ORDER = [ "User", "Award", "Lab", "Document", "Protocol", "ExperimentType", diff --git a/wranglertools/get_field_info.py b/wranglertools/get_field_info.py index c1190690..f1118470 100755 --- a/wranglertools/get_field_info.py +++ b/wranglertools/get_field_info.py @@ -11,7 +11,7 @@ import json from wranglertools.constants import ( - HOME, CONFDIR, ENV_VAR_DIR, DEFAULT_KEYPAIR_FILE, SHEET_ORDER + HOME, CONFDIR, CONFDIR_ENVVAR, DEFAULT_KEYPAIR_FILE, SHEET_ORDER ) @@ -108,12 +108,12 @@ def __init__(self, keyfile, keyname): then look there for the keys first - using keyname if provided or - look for a "default" entry if not - 2) check to see if the ENV_VAR_DIR env 
variable is set and in this + 2) check to see if the CONFDIR_ENVVAR env variable is set and in this case we expect to find a file named keypairs.json with keys to use there where we will look for key named keyname or "default" 3) finally look for keypairs.json in the .submit4dn directory and in home directory - ''' + ''' self.error = False keys = None default_location = CONFDIR.joinpath(DEFAULT_KEYPAIR_FILE) @@ -130,7 +130,7 @@ def __init__(self, keyfile, keyname): self.set_error(msg) return else: # default keyfile arg has been passed - envdir = os.environ.get(p ENV_VAR_DIR) + envdir = os.environ.get(CONFDIR_ENVVAR) if envdir: # loc of keypairs.json specified in env var fpath = pp.Path(envdir).joinpath(DEFAULT_KEYPAIR_FILE) if not fpath.is_file(): diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py index a47118a3..5167352e 100755 --- a/wranglertools/import_data.py +++ b/wranglertools/import_data.py @@ -26,7 +26,7 @@ from gspread.exceptions import GSpreadException from openpyxl.utils.exceptions import InvalidFileException from wranglertools.constants import ( - CONFDIR, SHEET_ORDER, LIST_OF_LOADXL_FIELDS, ENV_VAR_DIR, GCRED_FNAME, + CONFDIR, SHEET_ORDER, LIST_OF_LOADXL_FIELDS, CONFDIR_ENVVAR, GCRED_FNAME, AUTH_TOKEN_FNAME, SCOPES, GSHEET, EXCEL, ZIP_MIME, XLSX_MIME, ALLOWED_MIMES, GSHEET_URL_REGEX, GSID_REGEX ) @@ -129,7 +129,7 @@ class WebFetchException(Exception): def google_authenticate(): gsauth = None - ga_cred_env = os.environ.get(ENV_VAR_DIR) # look to see if set as env variable + ga_cred_env = os.environ.get(CONFDIR_ENVVAR) # look to see if set as env variable # default to .submit4dn dir in home dir creddir = pp.Path(ga_cred_env) if ga_cred_env else CONFDIR try: From 29bce7faba52b3af5505fdd5688ae98d9b2f35af Mon Sep 17 00:00:00 2001 From: aschroed Date: Mon, 7 Nov 2022 15:40:42 -0500 Subject: [PATCH 13/14] added tests for google_authenticate and fixed a bug they spotted --- tests/test_import_data.py | 19 +++++++++++++++++-- wranglertools/import_data.py | 6 +++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/tests/test_import_data.py b/tests/test_import_data.py index 636ee976..63912fb1 100644 --- a/tests/test_import_data.py +++ b/tests/test_import_data.py @@ -5,6 +5,7 @@ import inspect import wranglertools.import_data as imp from tests.conftest import MockedGoogleWorkSheet, MockedGoogleWorkBook, MockedGauth +from gspread.exceptions import GSpreadException # test data is in conftest.py @@ -13,7 +14,7 @@ def test_imp_get_args_required_default(): defaults = { 'type': ['all'], 'key': 'default', - 'keyfile': CONFDIR / DEFAULT_KEYPAIR_FILE, + 'keyfile': CONFDIR / DEFAULT_KEYPAIR_FILE, 'debug': False, 'update': False, 'patchall': False, @@ -26,7 +27,6 @@ def test_imp_get_args_required_default(): assert args.infile == 'infile' - def convert_to_path_with_tilde(string_path): """Somehow the inverse of pathlib.Path.expanduser(). 
Helper function used to generate valid paths containing ~ """ @@ -143,6 +143,21 @@ def test_reader_wrong_sheetname(capsys): assert out == msg +def test_google_authenticate_mock(mocker): + mocker.patch('wranglertools.import_data.os.environ.get', return_value=None) + mocker.patch('wranglertools.import_data.gspread.oauth', return_value=True) + gauth = imp.google_authenticate() + assert gauth + + +def test_google_authenticate_exception(mocker, capsys): + mocker.patch('wranglertools.import_data.os.environ.get', return_value=None) + mocker.patch("wranglertools.import_data.CONFDIR", pp.Path('no/such/directory')) + imp.google_authenticate() + out = capsys.readouterr()[0] + assert out.startswith('GOOGLE AUTH PROBLEM:') + + @pytest.fixture def gs_test_data(): return {'row1': ['a', 'b', 'c'], 'row2': ['d', 'e', 'f']} diff --git a/wranglertools/import_data.py b/wranglertools/import_data.py index 5167352e..726eb746 100755 --- a/wranglertools/import_data.py +++ b/wranglertools/import_data.py @@ -138,8 +138,8 @@ def google_authenticate(): authorized_user_filename=creddir.joinpath(AUTH_TOKEN_FNAME), scopes=SCOPES ) - except GSpreadException as gse: - raise f"GOOGLE AUTH PROBLEM: {gse}" + except (GSpreadException, FileNotFoundError) as gse: + print(f"GOOGLE AUTH PROBLEM: {gse}") return gsauth @@ -1093,7 +1093,7 @@ def workbook_reader(workbook, booktype, sheet, update, connection, patchall, ali if sheet == "FileFastq" and not novalidate: # check for consistent file pairing of fastqs in the sheet - pair_errs = check_file_pairing(reader(workbook, sheetname=sheet)) + pair_errs = check_file_pairing(reader(workbook, sheetname=sheet, booktype=booktype)) for f, err in sorted(pair_errs.items()): for e in err: print('WARNING: ', f, '\t', e) From 06c3d70caf2523fcacea2a81f931f8f2c79540de Mon Sep 17 00:00:00 2001 From: aschroed Date: Mon, 7 Nov 2022 15:43:32 -0500 Subject: [PATCH 14/14] couple of flake8 fixes --- tests/test_get_field_info.py | 7 +++++-- tests/test_import_data.py | 2 -- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_get_field_info.py b/tests/test_get_field_info.py index d3df63d9..93ba3564 100644 --- a/tests/test_get_field_info.py +++ b/tests/test_get_field_info.py @@ -12,7 +12,7 @@ def test_gfi_get_args_required_default(): defaults = { 'type': ['all'], 'key': 'default', - 'keyfile': CONFDIR / DEFAULT_KEYPAIR_FILE, + 'keyfile': CONFDIR / DEFAULT_KEYPAIR_FILE, 'debug': False, 'nodesc': False, 'comments': False, @@ -52,10 +52,12 @@ def test_key_as_dict(): def keydirname(): return './tests/data_files/' + @pytest.fixture def keydir(keydirname): return Path(keydirname) + @pytest.fixture def keyfilename(): return 'keypairs.json' @@ -75,6 +77,7 @@ def missing_dirname(): def missing_dir(missing_dirname): return Path(missing_dirname) + @pytest.mark.file_operation def test_key_file(keypath): ''' testing when an actual keyfile path is provided as per --keyfile option''' @@ -106,7 +109,6 @@ def test_key_from_env_set_wrong(mocker, capsys): default_keypath = CONFDIR / DEFAULT_KEYPAIR_FILE baddir = 'some/other/name/' mocker.patch('wranglertools.get_field_info.os.environ.get', return_value=baddir) - # import pdb; pdb.set_trace() key = gfi.FDN_Key(default_keypath, 'default') out = capsys.readouterr()[0] assert key.error @@ -174,6 +176,7 @@ def test_key_error_bad_keyname(capsys): assert key.error assert out == "ERROR: No key with name 'nosuchkey' found - check your keypairs file\n" + def bad_connection_will_exit(): with pytest.raises(SystemExit) as excinfo: keypairs = { diff --git 
a/tests/test_import_data.py b/tests/test_import_data.py index 63912fb1..8988a0b6 100644
--- a/tests/test_import_data.py
+++ b/tests/test_import_data.py
@@ -5,8 +5,6 @@
 import inspect
 import wranglertools.import_data as imp
 from tests.conftest import MockedGoogleWorkSheet, MockedGoogleWorkBook, MockedGauth
-from gspread.exceptions import GSpreadException
-
 # test data is in conftest.py
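
Taken together, the FDN_Key changes in this series settle on a lookup order for the keypairs file: an explicit --keyfile path, then a directory named in the SUBMIT_4DN_CONFDIR environment variable, then keypairs.json in the .submit4dn config directory, and finally keypairs.json in the home directory. A minimal standalone sketch of that precedence follows; the function name and the bare fallback return are illustrative, and the real logic in FDN_Key.__init__ also verifies that each candidate file actually exists before using it.

import os
import pathlib as pp

HOME = pp.Path.home()
CONFDIR = HOME.joinpath('.submit4dn')          # values as defined in wranglertools/constants.py
DEFAULT_KEYPAIR_FILE = 'keypairs.json'
CONFDIR_ENVVAR = 'SUBMIT_4DN_CONFDIR'


def resolve_keypairs_path(keyfile=None):
    """Illustrative only: mirrors the precedence FDN_Key.__init__ uses to locate keypairs.json."""
    default_location = CONFDIR.joinpath(DEFAULT_KEYPAIR_FILE)
    # 1) an explicit --keyfile value that is not the default location wins
    if keyfile and pp.Path(str(keyfile)) != default_location:
        return pp.Path(str(keyfile))
    # 2) otherwise a config directory named in the env variable is searched for keypairs.json
    envdir = os.environ.get(CONFDIR_ENVVAR)
    if envdir:
        return pp.Path(envdir).joinpath(DEFAULT_KEYPAIR_FILE)
    # 3) then the default ~/.submit4dn/keypairs.json
    if default_location.is_file():
        return default_location
    # 4) finally fall back to keypairs.json in the home directory
    return HOME.joinpath(DEFAULT_KEYPAIR_FILE)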
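
The gsheet booktype added by these patches ultimately rests on gspread's OAuth flow. The sketch below shows that flow end to end under assumptions: the credential and token filenames and the scope list are stand-ins for the GCRED_FNAME, AUTH_TOKEN_FNAME and SCOPES constants the code imports from wranglertools/constants.py, and get_all_values is standard gspread rather than anything this series adds.

import pathlib as pp
import gspread

CONFDIR = pp.Path.home().joinpath('.submit4dn')
GCRED_FNAME = 'credentials.json'               # assumed filename; real value lives in wranglertools/constants.py
AUTH_TOKEN_FNAME = 'authorized_user.json'      # assumed filename; real value lives in wranglertools/constants.py
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']  # assumed scope list


def read_gsheet_rows(gsid, sheetname=None):
    """Illustrative only: authenticate with gspread and pull all rows from one worksheet."""
    gc = gspread.oauth(
        credentials_filename=str(CONFDIR.joinpath(GCRED_FNAME)),
        authorized_user_filename=str(CONFDIR.joinpath(AUTH_TOKEN_FNAME)),
        scopes=SCOPES,
    )
    wkbk = gc.open_by_key(gsid)
    # named worksheet if given, otherwise the first sheet, as reader() does for the gsheet booktype
    sheet = wkbk.worksheet(sheetname) if sheetname else wkbk.get_worksheet(0)
    return sheet.get_all_values()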