From 92aa1b971dfa0c7a05dd19149a01d33c9923ed0b Mon Sep 17 00:00:00 2001 From: Kelly McDonald Date: Wed, 2 Dec 2020 16:04:00 -0500 Subject: [PATCH 01/22] commit before removing big long SQL --- crc/api.yml | 17 ++++++++++++ crc/api/workflow.py | 67 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/crc/api.yml b/crc/api.yml index ad22677e..44138510 100644 --- a/crc/api.yml +++ b/crc/api.yml @@ -100,6 +100,23 @@ paths: type: array items: $ref: "#/components/schemas/Study" + /workflow_spec/all: + get: + operationId: crc.api.workflow.get_all_spec_state + summary: Provides a list of workflow specs and their signature + tags: + - Workflow Spec States + responses: + '200': + description: An array of workflow specs, with last touched date and file signature. + content: + application/json: + schema: + type: array + items: + $ref: "#/components/schemas/Study" + + /study/all: get: operationId: crc.api.study.all_studies diff --git a/crc/api/workflow.py b/crc/api/workflow.py index a148328a..37a5059a 100644 --- a/crc/api/workflow.py +++ b/crc/api/workflow.py @@ -1,11 +1,15 @@ +import hashlib +import json import uuid +from hashlib import md5 +import pandas as pd from SpiffWorkflow.util.deep_merge import DeepMerge from flask import g -from crc import session, app +from crc import session, db from crc.api.common import ApiError, ApiErrorSchema from crc.models.api_models import WorkflowApi, WorkflowApiSchema, NavigationItem, NavigationItemSchema -from crc.models.file import FileModel, LookupDataSchema +from crc.models.file import FileModel, LookupDataSchema, FileDataModel from crc.models.study import StudyModel, WorkflowMetadata from crc.models.task_event import TaskEventModel, TaskEventModelSchema, TaskEvent, TaskEventSchema from crc.models.workflow import WorkflowModel, WorkflowSpecModelSchema, WorkflowSpecModel, WorkflowSpecCategoryModel, \ @@ -257,3 +261,62 @@ def _verify_user_and_role(processor, spiff_task): raise ApiError.from_task("permission_denied", f"This task must be completed by '{allowed_users}', " f"but you are {user.uid}", spiff_task) +def join_uuids(uuids): + """Joins a pandas Series of uuids and combines them in one hash""" + combined_uuids = ''.join([str(uuid) for uuid in uuids.sort_values()]) # ensure that values are always + # in the same order + return hashlib.md5(combined_uuids.encode('utf8')).hexdigest() # make a hash of the hashes + +def get_all_spec_state(): +# big_nasty_sql = """ +# +# with first as ( +# select file_model_id,version,max(version) over (partition by file_model_id) maxversion,f +# .name, +# workflow_spec_id,date_created,md5_hash +# from file f join file_data fd on fd. 
+#             file_model_id = f.id
+#             where workflow_spec_id is not null
+#             order by workflow_spec_id,file_model_id,version desc),
+#     second as (select name,workflow_spec_id,date_created,md5_hash from first
+#                where maxversion = version)
+#     select row_to_json(t) workflow_spec_state
+#     from (
+#     select workflow_spec_id,max(date_created) last_update,md5(max(
+#     (
+#     select array_to_json(array_agg(row_to_json(d)))
+#     from (select name,md5_hash from second where second.workflow_spec_id = second2.workflow_spec_id) d
+#
+#     )::text)) as files_hash
+#     from second as second2
+#     group by 1
+#     ) t
+#     """
+#     json_results = []
+#     result = db.engine.execute(big_nasty_sql)
+    x = session.query(FileDataModel).join(FileModel)
+
+    # there might be a cleaner way of getting a data frame from some of the
+    # fields in the ORM - but this works OK
+    filelist = []
+    for file in x:
+        filelist.append({'file_model_id':file.file_model_id,
+                         'workflow_spec_id': file.file_model.workflow_spec_id,
+                         'md5_hash':file.md5_hash,
+                         'filename':file.file_model.name,
+                         'date_created':file.date_created})
+    df = pd.DataFrame(filelist)
+
+    # get a distinct list of file_model_id's with the most recent file_data retained
+    df = df.sort_values('date_created').drop_duplicates(['file_model_id'],keep='last').copy()
+
+    # take that list and then group by workflow_spec and retain the most recently touched file
+    # and make a consolidated hash of the md5_checksums - this acts as a 'thumbprint' for each
+    # workflow spec
+    df = df.groupby('workflow_spec_id').agg({'date_created':'max',
+                                             'md5_hash':join_uuids}).copy()
+
+    df = df.reset_index()[['workflow_spec_id','date_created','md5_hash']].copy()
+
+    return df.to_json(orient='records')
+
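The thumbprint idea in this first patch is worth spelling out: every file version already carries an md5, so a workflow spec can be signed by sorting the per-file hashes and hashing the concatenation - the signature is stable no matter what order the rows come back in, and it changes whenever any file changes. A rough standalone sketch of the same technique on hand-built data (illustrative only, not code from the patch):

    import hashlib
    import pandas as pd

    def join_uuids(uuids):
        # sort first so the combined hash is independent of row order
        combined = ''.join(str(u) for u in uuids.sort_values())
        return hashlib.md5(combined.encode('utf8')).hexdigest()

    files = pd.DataFrame([
        {'workflow_spec_id': 'spec_a', 'md5_hash': 'aaa', 'date_created': '2020-12-01'},
        {'workflow_spec_id': 'spec_a', 'md5_hash': 'bbb', 'date_created': '2020-12-02'},
        {'workflow_spec_id': 'spec_b', 'md5_hash': 'ccc', 'date_created': '2020-12-02'},
    ])
    # one row per spec: newest touch date plus a hash of the per-file hashes
    thumbprints = files.groupby('workflow_spec_id').agg(
        {'date_created': 'max', 'md5_hash': join_uuids})
    print(thumbprints.reset_index().to_json(orient='records'))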
From 0e8913434abe61eddc21e92f4485bd92b4e904b0 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Thu, 3 Dec 2020 08:44:15 -0500
Subject: [PATCH 02/22] refactor a bit

---
 crc/api/workflow.py | 40 ++++++++++++----------------------------
 1 file changed, 12 insertions(+), 28 deletions(-)

diff --git a/crc/api/workflow.py b/crc/api/workflow.py
index 37a5059a..7f517e22 100644
--- a/crc/api/workflow.py
+++ b/crc/api/workflow.py
@@ -267,33 +267,16 @@ def join_uuids(uuids):
                                                            # in the same order
     return hashlib.md5(combined_uuids.encode('utf8')).hexdigest()  # make a hash of the hashes

+@cross_origin()  # allow all cross origin requests so we can hit it with a dev box
 def get_all_spec_state():
-#     big_nasty_sql = """
-#
-#     with first as (
-#         select file_model_id,version,max(version) over (partition by file_model_id) maxversion,f
-#         .name,
-#         workflow_spec_id,date_created,md5_hash
-#         from file f join file_data fd on fd.
-#             file_model_id = f.id
-#             where workflow_spec_id is not null
-#             order by workflow_spec_id,file_model_id,version desc),
-#     second as (select name,workflow_spec_id,date_created,md5_hash from first
-#                where maxversion = version)
-#     select row_to_json(t) workflow_spec_state
-#     from (
-#     select workflow_spec_id,max(date_created) last_update,md5(max(
-#     (
-#     select array_to_json(array_agg(row_to_json(d)))
-#     from (select name,md5_hash from second where second.workflow_spec_id = second2.workflow_spec_id) d
-#
-#     )::text)) as files_hash
-#     from second as second2
-#     group by 1
-#     ) t
-#     """
-#     json_results = []
-#     result = db.engine.execute(big_nasty_sql)
+    df = get_all_spec_state_dataframe()
+    return df.reset_index().to_json(orient='records')
+
+def get_all_spec_state_dataframe():
+    """
+    Return a list of all workflow specs along with last updated date and a
+    thumbprint of all of the files that are used for that workflow_spec
+    """
     x = session.query(FileDataModel).join(FileModel)

     # there might be a cleaner way of getting a data frame from some of the
     # fields in the ORM - but this works OK
     filelist = []
     for file in x:
         filelist.append({'file_model_id':file.file_model_id,
                          'workflow_spec_id': file.file_model.workflow_spec_id,
                          'md5_hash':file.md5_hash,
                          'filename':file.file_model.name,
                          'date_created':file.date_created})
     df = pd.DataFrame(filelist)

     # get a distinct list of file_model_id's with the most recent file_data retained
     df = df.sort_values('date_created').drop_duplicates(['file_model_id'],keep='last').copy()

     # take that list and then group by workflow_spec and retain the most recently touched file
     # and make a consolidated hash of the md5_checksums - this acts as a 'thumbprint' for each
     # workflow spec
     df = df.groupby('workflow_spec_id').agg({'date_created':'max',
                                              'md5_hash':join_uuids}).copy()
+    # get only the columns we are really interested in returning
+    df = df[['date_created','md5_hash']].copy()
+    # convert dates to string
+    df['date_created'] = df['date_created'].astype('str')

-    df = df.reset_index()[['workflow_spec_id','date_created','md5_hash']].copy()

-    return df.to_json(orient='records')
+    return df

From bcb45a59c8e42a65dcbd93d3c861273626d854d1 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Thu, 3 Dec 2020 08:46:34 -0500
Subject: [PATCH 03/22] allow cors

---
 crc/api/workflow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crc/api/workflow.py b/crc/api/workflow.py
index 7f517e22..a5f2d3b1 100644
--- a/crc/api/workflow.py
+++ b/crc/api/workflow.py
@@ -20,7 +20,7 @@ from crc.services.study_service import StudyService
 from crc.services.user_service import UserService
 from crc.services.workflow_processor import WorkflowProcessor
 from crc.services.workflow_service import WorkflowService
-
+from flask_cors import cross_origin

 def all_specifications():
     schema = WorkflowSpecModelSchema(many=True)
@@ -267,7 +267,7 @@ def join_uuids(uuids):
                                                            # in the same order
     return hashlib.md5(combined_uuids.encode('utf8')).hexdigest()  # make a hash of the hashes

-@cross_origin()  # allow all cross origin requests so we can hit it with a dev box
+@cross_origin()  # allow even dev boxes to hit this without restrictions
 def get_all_spec_state():
     df = get_all_spec_state_dataframe()
     return df.reset_index().to_json(orient='records')
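The @cross_origin() decorator used here is the stock flask-cors one; applied per view it opens just that endpoint to requests from any origin, rather than loosening CORS for the whole app. A minimal sketch of the same pattern on a toy app (assumed standalone setup, not part of this repo):

    from flask import Flask
    from flask_cors import cross_origin

    app = Flask(__name__)

    @app.route('/status')
    @cross_origin()  # adds Access-Control-Allow-Origin: * to this view only
    def status():
        return {'ok': True}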
From 0f59d3de096f2f938e601d6ef75ffc62611f5e3c Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Thu, 3 Dec 2020 14:45:57 -0500
Subject: [PATCH 04/22] add endpoint that gets a record for all changed /new
 workflow_specs at a remote endpoint

---
 crc/api.yml         | 28 ++++++++++++++++++++++++++++
 crc/api/workflow.py | 39 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/crc/api.yml b/crc/api.yml
index 44138510..959d6061 100644
--- a/crc/api.yml
+++ b/crc/api.yml
@@ -100,10 +100,38 @@ paths:
             type: array
             items:
               $ref: "#/components/schemas/Study"
+  /workflow_spec/diff:
+    get:
+      operationId: crc.api.workflow.get_changed_workflows
+      summary: Provides a list of workflow specs and their signature
+      security: []  # Disable security for this endpoint only - we'll sanity check
+                    # in the endpoint
+      parameters:
+        - name: remote
+          in: query
+          required: true
+          description: The remote endpoint
+          schema:
+            type: string
+      tags:
+        - Workflow Spec States
+      responses:
+        '200':
+          description: An array of workflow specs, with last touched date and file signature.
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/Study"
+
+
   /workflow_spec/all:
     get:
       operationId: crc.api.workflow.get_all_spec_state
       summary: Provides a list of workflow specs and their signature
+      security: []  # Disable security for this endpoint only - we'll sanity check
+                    # in the endpoint
       tags:
         - Workflow Spec States
       responses:
diff --git a/crc/api/workflow.py b/crc/api/workflow.py
index a5f2d3b1..d899d4fb 100644
--- a/crc/api/workflow.py
+++ b/crc/api/workflow.py
@@ -21,6 +21,7 @@ from crc.services.study_service import StudyService
 from crc.services.user_service import UserService
 from crc.services.workflow_processor import WorkflowProcessor
 from crc.services.workflow_service import WorkflowService
 from flask_cors import cross_origin
+import requests

 def all_specifications():
     schema = WorkflowSpecModelSchema(many=True)
@@ -267,10 +268,44 @@ def join_uuids(uuids):
                                                            # in the same order
     return hashlib.md5(combined_uuids.encode('utf8')).hexdigest()  # make a hash of the hashes

+def get_changed_workflows(remote):
+    """
+    gets a remote endpoint - gets the workflows and then
+    determines what workflows are different from the remote endpoint
+    """
+    x = requests.get('http://'+remote+'/v1.0/workflow_spec/all')
+
+    # This is probably very insecure and may allow cross site attacks - fix later
+    remote = pd.DataFrame(eval(x.text))
+    local = get_all_spec_state_dataframe().reset_index()
+    different = remote.merge(local, right_on=['workflow_spec_id','md5_hash'], left_on=['workflow_spec_id',
+                             'md5_hash'], how = 'outer' , indicator=True).loc[
+        lambda x : x['_merge']!='both']
+
+    different.loc[different['_merge']=='left_only','location'] = 'remote'
+    different.loc[different['_merge']=='right_only','location'] = 'local'
+    #changedfiles = different.copy()
+    index = different['date_created_x'].isnull()
+    different.loc[index,'date_created_x'] = different[index]['date_created_y']
+    different = different[['workflow_spec_id','date_created_x','location']].copy()
+    different.columns=['workflow_spec_id','date_created','location']
+    changedfiles = different.sort_values('date_created',ascending=False).groupby('workflow_spec_id').first()
+    remote_spec_ids = remote[['workflow_spec_id']]
+    local_spec_ids = local[['workflow_spec_id']]
+
+    left = remote_spec_ids[~remote_spec_ids['workflow_spec_id'].isin(local_spec_ids['workflow_spec_id'])]
+    right = local_spec_ids[~local_spec_ids['workflow_spec_id'].isin(remote_spec_ids['workflow_spec_id'])]
+    changedfiles['new'] = False
+    changedfiles.loc[changedfiles.index.isin(left['workflow_spec_id']), 'new'] = True
+    output = changedfiles[~changedfiles.index.isin(right['workflow_spec_id'])]
+    return output.reset_index().to_dict(orient='records')
+
+
 def get_all_spec_state():
     df = get_all_spec_state_dataframe()
-    return df.reset_index().to_json(orient='records')
+    return df.reset_index().to_dict(orient='records')
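At this point the diff endpoint can be exercised directly; it is still unauthenticated (the API key only arrives in a later patch). A sketch of a client call, with placeholder host names rather than real deployments:

    import requests

    # ask the local box how it differs from a (hypothetical) remote box
    resp = requests.get('http://localhost:5000/v1.0/workflow_spec/diff',
                        params={'remote': 'remote.example.com'})
    for row in resp.json():
        # 'location' says which side has the newer copy; 'new' flags specs
        # that only exist on the remote side
        print(row['workflow_spec_id'], row['date_created'], row['location'], row['new'])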
From 10dce542ec612027ba2d2a8d995f4c48410708d0 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Thu, 3 Dec 2020 15:27:45 -0500
Subject: [PATCH 05/22] documentation change

---
 crc/api/workflow.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/crc/api/workflow.py b/crc/api/workflow.py
index d899d4fb..a65f6dda 100644
--- a/crc/api/workflow.py
+++ b/crc/api/workflow.py
@@ -304,6 +304,11 @@ def get_changed_workflows(remote):


 def get_all_spec_state():
+    """
+    Return a list of all workflow specs along with last updated date and a
+    thumbprint of all of the files that are used for that workflow_spec
+    Convert into a dict list from a dataframe
+    """
     df = get_all_spec_state_dataframe()
     return df.reset_index().to_dict(orient='records')

@@ -311,6 +316,7 @@ def get_all_spec_state_dataframe():
     """
     Return a list of all workflow specs along with last updated date and a
     thumbprint of all of the files that are used for that workflow_spec
+    Return a dataframe
     """
     x = session.query(FileDataModel).join(FileModel)

From 3e8d4ca7c942af25f5a3f8f0608417068462de5b Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Fri, 4 Dec 2020 10:23:03 -0500
Subject: [PATCH 06/22] Add some inline documentation to make the process more
 clear

---
 crc/api/workflow.py | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/crc/api/workflow.py b/crc/api/workflow.py
index a65f6dda..726aa78e 100644
--- a/crc/api/workflow.py
+++ b/crc/api/workflow.py
@@ -273,31 +273,50 @@ def get_changed_workflows(remote):
     gets a remote endpoint - gets the workflows and then
     determines what workflows are different from the remote endpoint
     """
-    x = requests.get('http://'+remote+'/v1.0/workflow_spec/all')
+    response = requests.get('http://'+remote+'/v1.0/workflow_spec/all')

     # This is probably very insecure and may allow cross site attacks - fix later
-    remote = pd.DataFrame(eval(x.text))
+    remote = pd.DataFrame(json.loads(response.text))
+    # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index
     local = get_all_spec_state_dataframe().reset_index()
-    different = remote.merge(local, right_on=['workflow_spec_id','md5_hash'], left_on=['workflow_spec_id',
-                             'md5_hash'], how = 'outer' , indicator=True).loc[
-        lambda x : x['_merge']!='both']

+    # merge these on workflow spec id and hash - this will
+    # make two different date columns date_x and date_y
+    different = remote.merge(local,
+                             right_on=['workflow_spec_id','md5_hash'],
+                             left_on=['workflow_spec_id','md5_hash'],
+                             how = 'outer' ,
+                             indicator=True).loc[lambda x : x['_merge']!='both']
+
+    # each line has a tag on it - if it was in the left or the right,
+    # label it so we know if that was on the remote or local machine
     different.loc[different['_merge']=='left_only','location'] = 'remote'
     different.loc[different['_merge']=='right_only','location'] = 'local'
-    #changedfiles = different.copy()
+
+    # this takes the different date_created_x and date_created_y columns and
+    # combines them back into one date_created column
     index = different['date_created_x'].isnull()
     different.loc[index,'date_created_x'] = different[index]['date_created_y']
     different = different[['workflow_spec_id','date_created_x','location']].copy()
     different.columns=['workflow_spec_id','date_created','location']
+
+    # our different list will have multiple entries for a workflow if there is a version on either side
+    # we want to grab the most recent one, so we sort and grab the most recent one for each workflow
     changedfiles = different.sort_values('date_created',ascending=False).groupby('workflow_spec_id').first()
+
+    # get an exclusive or list of workflow ids - that is we want lists of files that are
+    # on one machine or the other, but not both
     remote_spec_ids = remote[['workflow_spec_id']]
     local_spec_ids = local[['workflow_spec_id']]
     left = remote_spec_ids[~remote_spec_ids['workflow_spec_id'].isin(local_spec_ids['workflow_spec_id'])]
     right = local_spec_ids[~local_spec_ids['workflow_spec_id'].isin(remote_spec_ids['workflow_spec_id'])]
+
+    # flag files as new that are only on the remote box and remove the files that are only on the local box
     changedfiles['new'] = False
     changedfiles.loc[changedfiles.index.isin(left['workflow_spec_id']), 'new'] = True
     output = changedfiles[~changedfiles.index.isin(right['workflow_spec_id'])]
+
+    # return the list as a dict, let swagger convert it to json
     return output.reset_index().to_dict(orient='records')
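The outer merge with indicator=True that these comments describe is easiest to see on toy frames. A rough sketch with made-up hashes (not CR Connect code):

    import pandas as pd

    remote = pd.DataFrame({'workflow_spec_id': ['a', 'b'], 'md5_hash': ['h1', 'h2']})
    local  = pd.DataFrame({'workflow_spec_id': ['b', 'c'], 'md5_hash': ['h2', 'h3']})
    merged = remote.merge(local, on=['workflow_spec_id', 'md5_hash'],
                          how='outer', indicator=True)
    # _merge is 'both' for rows identical on both sides, 'left_only' for
    # remote-only rows and 'right_only' for local-only rows
    print(merged.loc[merged['_merge'] != 'both'])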
From d41d018fe34fa91569c5c64225c69622514e0201 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Fri, 4 Dec 2020 11:49:07 -0500
Subject: [PATCH 07/22] For a given workflow - find the files that are
 different from a remote endpoint for the same workflow

---
 crc/api.yml         | 61 +++++++++++++++++++++++++++++++++
 crc/api/workflow.py | 81 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 140 insertions(+), 2 deletions(-)

diff --git a/crc/api.yml b/crc/api.yml
index 959d6061..afeb2bc2 100644
--- a/crc/api.yml
+++ b/crc/api.yml
@@ -126,10 +126,69 @@ paths:
           $ref: "#/components/schemas/Study"

+  /workflow_spec/{workflow_spec_id}/files:
+    get:
+      operationId: crc.api.workflow.get_workflow_spec_files
+      summary: Provides a list of workflow specs and their signature
+      security: []  # Disable security for this endpoint only - we'll sanity check
+                    # in the endpoint
+      parameters:
+        - name: workflow_spec_id
+          in: path
+          required: true
+          description: The workflow_spec id
+          schema:
+            type: string
+
+      tags:
+        - Workflow Spec States
+      responses:
+        '200':
+          description: An array of workflow specs, with last touched date and file signature.
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/Study"
+
+  /workflow_spec/{workflow_spec_id}/files/diff:
+    get:
+      operationId: crc.api.workflow.get_changed_files
+      summary: Provides a list of workflow specs and their signature
+      security: []  # Disable security for this endpoint only - we'll sanity check
+                    # in the endpoint
+      parameters:
+        - name: workflow_spec_id
+          in: path
+          required: true
+          description: The workflow_spec id
+          schema:
+            type: string
+        - name: remote
+          in: query
+          required: true
+          description: The remote endpoint
+          schema:
+            type: string
+
+      tags:
+        - Workflow Spec States
+      responses:
+        '200':
+          description: An array of workflow specs, with last touched date and file signature.
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/Study"
+
+
   /workflow_spec/all:
     get:
       operationId: crc.api.workflow.get_all_spec_state
-      summary: Provides a list of workflow specs and their signature
+      summary: Provides a list of files for a workflow spec
       security: []  # Disable security for this endpoint only - we'll sanity check
                     # in the endpoint
       tags:
diff --git a/crc/api/workflow.py b/crc/api/workflow.py
index 726aa78e..9484d183 100644
--- a/crc/api/workflow.py
+++ b/crc/api/workflow.py
@@ -320,9 +320,88 @@ def get_changed_workflows(remote):
     # return the list as a dict, let swagger convert it to json
     return output.reset_index().to_dict(orient='records')

+def get_changed_files(remote,workflow_spec_id):
+    """
+    gets a remote endpoint - gets the files for a workflow_spec on both
+    local and remote and determines what files have been changed and returns a list of those
+    files
+    """
+    response = requests.get('http://'+remote+'/v1.0/workflow_spec/'+workflow_spec_id+'/files')
+    # This is probably very insecure and may allow cross site attacks - fix later
+    remote = pd.DataFrame(json.loads(response.text))
+    # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index
+    local = get_workflow_spec_files_dataframe(workflow_spec_id).reset_index()
+    local['md5_hash'] = local['md5_hash'].astype('str')
+    different = remote.merge(local,
+                             right_on=['filename','md5_hash'],
+                             left_on=['filename','md5_hash'],
+                             how = 'outer' ,
+                             indicator=True).loc[lambda x : x['_merge']!='both']
+
+    # each line has a tag on it - if it was in the left or the right,
+    # label it so we know if that was on the remote or local machine
+    different.loc[different['_merge']=='left_only','location'] = 'remote'
+    different.loc[different['_merge']=='right_only','location'] = 'local'
+
+    # this takes the different date_created_x and date_created_y columns and
+    # combines them back into one date_created column
+    index = different['date_created_x'].isnull()
+    different.loc[index,'date_created_x'] = different[index]['date_created_y']
+    different = different[['date_created_x','filename','location']].copy()
+
+    different.columns=['date_created','filename','location']
+    # our different list will have multiple entries for a workflow if there is a version on either side
+    # we want to grab the most recent one, so we sort and grab the most recent one for each workflow
+    changedfiles = different.sort_values('date_created',ascending=False).groupby('filename').first()
+
+    # get an exclusive or list of workflow ids - that is we want lists of files that are
+    # on one machine or the other, but not both
+    remote_spec_ids = remote[['filename']]
+    local_spec_ids = local[['filename']]
+    left = remote_spec_ids[~remote_spec_ids['filename'].isin(local_spec_ids['filename'])]
+    right = local_spec_ids[~local_spec_ids['filename'].isin(remote_spec_ids['filename'])]
+    changedfiles['new'] = False
+    changedfiles.loc[changedfiles.index.isin(left['filename']), 'new'] = True
+    changedfiles.loc[changedfiles.index.isin(right['filename']),'new'] = True
+
+    # return the list as a dict, let swagger convert it to json
+    return changedfiles.reset_index().to_dict(orient='records')
+
+

 def get_all_spec_state():
     """
     Return a list of all workflow specs along with last updated date and a
     thumbprint of all of the files that are used for that workflow_spec
     Convert into a dict list from a dataframe
     """
     df = get_all_spec_state_dataframe()
     return df.reset_index().to_dict(orient='records')

+
+def get_workflow_spec_files(workflow_spec_id):
+    """
+    Return a list of all workflow specs along with last updated date and a
+    thumbprint of all of the files that are used for that workflow_spec
+    Convert into a dict list from a dataframe
+    """
+    df = get_workflow_spec_files_dataframe(workflow_spec_id)
+    return df.reset_index().to_dict(orient='records')
+
+
+def get_workflow_spec_files_dataframe(workflowid):
+    """
+    Return a list of all files for a workflow_spec along with last updated date and a
+    hash so we can determine file differences for a changed workflow on a box.
+    Return a dataframe
+    """
+    x = session.query(FileDataModel).join(FileModel).filter(FileModel.workflow_spec_id==workflowid)
+    # there might be a cleaner way of getting a data frame from some of the
+    # fields in the ORM - but this works OK
+    filelist = []
+    for file in x:
+        filelist.append({'file_model_id':file.file_model_id,
+                         'workflow_spec_id': file.file_model.workflow_spec_id,
+                         'md5_hash':file.md5_hash,
+                         'filename':file.file_model.name,
+                         'date_created':file.date_created})
+    df = pd.DataFrame(filelist).sort_values('date_created').groupby('file_model_id').last()
+    df['date_created'] = df['date_created'].astype('str')
+    return df
+
+
+
 def get_all_spec_state_dataframe():
     """
     Return a list of all workflow specs along with last updated date and a
     thumbprint of all of the files that are used for that workflow_spec
     Return a dataframe
     """
     x = session.query(FileDataModel).join(FileModel)
-
     # there might be a cleaner way of getting a data frame from some of the
     # fields in the ORM - but this works OK
     filelist = []
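One subtlety in get_changed_files above: the local md5_hash column is cast to str before the merge. The local hashes do not come out of the ORM as plain strings (the join_uuids docstring treats them as uuids), while the remote side arrives as JSON strings, and pandas will not match merge keys of differing types. A small demonstration on hand-built frames (illustrative only):

    import uuid
    import pandas as pd

    local = pd.DataFrame({'md5_hash': [uuid.UUID('f12e2bbd-a20c-673b-ccb8-a8a1ea9c5b7b')]})
    remote = pd.DataFrame({'md5_hash': ['f12e2bbd-a20c-673b-ccb8-a8a1ea9c5b7b']})

    # no rows match while one side holds UUID objects and the other strings
    print(local.merge(remote, on='md5_hash', how='inner').shape[0])  # 0
    local['md5_hash'] = local['md5_hash'].astype('str')
    print(local.merge(remote, on='md5_hash', how='inner').shape[0])  # 1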
From cad613cf6395c5fd3fbfb03e5f6f14b205fe0294 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Fri, 4 Dec 2020 12:00:02 -0500
Subject: [PATCH 08/22] Fix problem when method is run for a workflow that is
 non-existent locally

---
 crc/api/workflow.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crc/api/workflow.py b/crc/api/workflow.py
index 9484d183..983097a6 100644
--- a/crc/api/workflow.py
+++ b/crc/api/workflow.py
@@ -405,6 +405,8 @@ def get_workflow_spec_files_dataframe(workflowid):
                          'md5_hash':file.md5_hash,
                          'filename':file.file_model.name,
                          'date_created':file.date_created})
+    if len(filelist) == 0:
+        return pd.DataFrame(columns=['file_model_id','workflow_spec_id','md5_hash','filename','date_created'])
     df = pd.DataFrame(filelist).sort_values('date_created').groupby('file_model_id').last()
     df['date_created'] = df['date_created'].astype('str')
     return df
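The guard added here matters because a frame built from an empty list has no columns at all, so the sort_values('date_created') on the next line would raise a KeyError instead of returning an empty result; building the empty frame with explicit columns keeps the downstream code path alive. A quick illustration:

    import pandas as pd

    empty = pd.DataFrame([])
    # empty.sort_values('date_created')  # would raise KeyError: 'date_created'

    guarded = pd.DataFrame(columns=['file_model_id', 'md5_hash', 'date_created'])
    print(guarded.sort_values('date_created'))  # no rows, but the columns exist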
From f26a8615a431dbd4a31e448cd29c3b4048a8a4af Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Mon, 7 Dec 2020 08:49:38 -0500
Subject: [PATCH 09/22] Get more file details so we can fill out everything
 locally and also add a method to download the file by md5_hash

---
 crc/api.yml         | 23 +++++++++++++++++++++++
 crc/api/file.py     |  5 ++++-
 crc/api/workflow.py | 28 ++++++++++++++++++++++------
 3 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/crc/api.yml b/crc/api.yml
index afeb2bc2..b35ad793 100644
--- a/crc/api.yml
+++ b/crc/api.yml
@@ -578,6 +578,29 @@ paths:
       responses:
         '204':
           description: The file has been removed.
+  /file/{md5_hash}/data:
+    parameters:
+      - name: md5_hash
+        in: path
+        required: true
+        description: The md5 hash of the file requested
+        schema:
+          type: string
+    get:
+      operationId: crc.api.file.get_file_data_by_hash
+      summary: Returns only the file contents
+      security: []  # Disable security for this endpoint only.
+      tags:
+        - Files
+      responses:
+        '200':
+          description: Returns the actual file
+          content:
+            application/octet-stream:
+              schema:
+                type: string
+                format: binary
+                example: ''
   /file/{file_id}/data:
     parameters:
       - name: file_id
diff --git a/crc/api/file.py b/crc/api/file.py
index 5cf54221..4f0b655f 100644
--- a/crc/api/file.py
+++ b/crc/api/file.py
@@ -6,7 +6,7 @@ from flask import send_file

 from crc import session
 from crc.api.common import ApiError
-from crc.models.file import FileSchema, FileModel, File, FileModelSchema
+from crc.models.file import FileSchema, FileModel, File, FileModelSchema, FileDataModel
 from crc.models.workflow import WorkflowSpecModel
 from crc.services.file_service import FileService

@@ -99,6 +99,9 @@ def update_file_data(file_id):
     file_model = FileService.update_file(file_model, file.stream.read(), file.content_type)
     return FileSchema().dump(to_file_api(file_model))

+def get_file_data_by_hash(md5_hash):
+    filedatamodel = session.query(FileDataModel).filter(FileDataModel.md5_hash == md5_hash).first()
+    return get_file_data(filedatamodel.file_model_id)

 def get_file_data(file_id, version=None):
     file_data = FileService.get_file_data(file_id, version)
diff --git a/crc/api/workflow.py b/crc/api/workflow.py
index 983097a6..f9c6ac58 100644
--- a/crc/api/workflow.py
+++ b/crc/api/workflow.py
@@ -345,11 +345,15 @@ def get_changed_files(remote,workflow_spec_id):

     # this takes the different date_created_x and date_created_y columns and
     # combines them back into one date_created column
-    index = different['date_created_x'].isnull()
-    different.loc[index,'date_created_x'] = different[index]['date_created_y']
-    different = different[['date_created_x','filename','location']].copy()
+    dualfields = ['date_created','type','primary','content_type','primary_process_id']
+    for merge in dualfields:
+        index = different[merge+'_x'].isnull()
+        different.loc[index,merge+'_x'] = different[index][merge+'_y']

-    different.columns=['date_created','filename','location']
+    fieldlist = [fld+'_x' for fld in dualfields]
+    different = different[ fieldlist + ['md5_hash','filename','location']].copy()
+
+    different.columns=dualfields+['md5_hash','filename','location']
     # our different list will have multiple entries for a workflow if there is a version on either side
     # we want to grab the most recent one, so we sort and grab the most recent one for each workflow
     changedfiles = different.sort_values('date_created',ascending=False).groupby('filename').first()
@@ -363,7 +367,7 @@ def get_changed_files(remote,workflow_spec_id):
     changedfiles['new'] = False
     changedfiles.loc[changedfiles.index.isin(left['filename']), 'new'] = True
     changedfiles.loc[changedfiles.index.isin(right['filename']),'new'] = True
-
+    changedfiles = changedfiles.replace({pd.np.nan: None})
     # return the list as a dict, let swagger convert it to json
     return changedfiles.reset_index().to_dict(orient='records')

@@ -404,9 +408,21 @@ def get_workflow_spec_files_dataframe(workflowid):
                          'workflow_spec_id': file.file_model.workflow_spec_id,
                          'md5_hash':file.md5_hash,
                          'filename':file.file_model.name,
+                         'type':file.file_model.type.name,
+                         'primary':file.file_model.primary,
+                         'content_type':file.file_model.content_type,
+                         'primary_process_id':file.file_model.primary_process_id,
                          'date_created':file.date_created})
     if len(filelist) == 0:
-        return pd.DataFrame(columns=['file_model_id','workflow_spec_id','md5_hash','filename','date_created'])
+        return pd.DataFrame(columns=['file_model_id',
+                                     'workflow_spec_id',
+                                     'md5_hash',
+                                     'filename',
+                                     'type',
+                                     'primary',
+                                     'content_type',
+                                     'primary_process_id',
+                                     'date_created'])
     df = pd.DataFrame(filelist).sort_values('date_created').groupby('file_model_id').last()
     df['date_created'] = df['date_created'].astype('str')
     return df
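With the hash endpoint in place, a sync client can fetch any file version knowing only the md5_hash reported by the files/diff listing. A sketch with a placeholder host and a hash borrowed from the schema examples later in this series (note a later patch renames the path to /hash_data):

    import requests

    md5_hash = 'f12e2bbd-a20c-673b-ccb8-a8a1ea9c5b7b'  # as reported by the diff listing
    response = requests.get('http://localhost:5000/v1.0/file/' + md5_hash + '/data')
    with open('data_security_plan.dmn', 'wb') as handle:
        handle.write(response.content)  # raw bytes, per the octet-stream response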
From 44c72115ae1cb2ff1134202485f112c55a349267 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Mon, 7 Dec 2020 08:50:20 -0500
Subject: [PATCH 10/22] Make sure we get the file we intended

---
 crc/api/file.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crc/api/file.py b/crc/api/file.py
index 4f0b655f..861f9f04 100644
--- a/crc/api/file.py
+++ b/crc/api/file.py
@@ -101,7 +101,7 @@ def update_file_data(file_id):

 def get_file_data_by_hash(md5_hash):
     filedatamodel = session.query(FileDataModel).filter(FileDataModel.md5_hash == md5_hash).first()
-    return get_file_data(filedatamodel.file_model_id)
+    return get_file_data(filedatamodel.file_model_id,version=filedatamodel.version)

 def get_file_data(file_id, version=None):
     file_data = FileService.get_file_data(file_id, version)

From 0e1aa59fa18373e87aeb15a09c031499e5124caa Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Tue, 8 Dec 2020 13:42:01 -0500
Subject: [PATCH 11/22] Make a change to make sure that if there is a new file
 locally that is not present remotely when we pull from the remote, the new
 local file gets deleted. Also: add several things to the requirements.txt
 that should have been there in the first place.

---
 crc/api.yml             | 35 +++++++++++++++++++++++-
 crc/api/workflow.py     | 56 +++++++++++++++++++++++++++++++++++++++--
 crc/models/workflow.py  |  2 +-
 deploy/requirements.txt |  6 +++++
 4 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/crc/api.yml b/crc/api.yml
index b35ad793..3aa1f5d8 100644
--- a/crc/api.yml
+++ b/crc/api.yml
@@ -152,6 +152,38 @@ paths:
         items:
           $ref: "#/components/schemas/Study"

+  /workflow_spec/{workflow_spec_id}/files/sync:
+    get:
+      operationId: crc.api.workflow.sync_changed_files
+      summary: Provides a list of files that were updated
+      security: []  # Disable security for this endpoint only - we'll sanity check
+                    # in the endpoint
+      parameters:
+        - name: workflow_spec_id
+          in: path
+          required: true
+          description: The workflow_spec id
+          schema:
+            type: string
+        - name: remote
+          in: query
+          required: true
+          description: The remote endpoint
+          schema:
+            type: string
+
+      tags:
+        - Workflow Spec States
+      responses:
+        '200':
+          description: An array of workflow specs, with last touched date and file signature.
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/Study"
+
   /workflow_spec/{workflow_spec_id}/files/diff:
     get:
       operationId: crc.api.workflow.get_changed_files
@@ -334,6 +366,7 @@ paths:
     get:
       operationId: crc.api.workflow.get_workflow_specification
       summary: Returns a single workflow specification
+      security: []
      tags:
        - Workflow Specifications
      responses:
@@ -578,7 +611,7 @@ paths:
       responses:
         '204':
           description: The file has been removed.
- /file/{md5_hash}/data: + /file/{md5_hash}/hash_data: parameters: - name: md5_hash in: path diff --git a/crc/api/workflow.py b/crc/api/workflow.py index f9c6ac58..fb0c1e4b 100644 --- a/crc/api/workflow.py +++ b/crc/api/workflow.py @@ -1,6 +1,7 @@ import hashlib import json import uuid +from io import StringIO from hashlib import md5 import pandas as pd @@ -319,8 +320,56 @@ def get_changed_workflows(remote): # return the list as a dict, let swagger convert it to json return output.reset_index().to_dict(orient='records') +def sync_all_changed_files(remote): + pass -def get_changed_files(remote,workflow_spec_id): +def sync_changed_files(remote,workflow_spec_id): + # make sure that spec is local before syncing files + remotespectext = requests.get('http://'+remote+'/v1.0/workflow-specification/'+workflow_spec_id) + specdict = json.loads(remotespectext.text) + localspec = session.query(WorkflowSpecModel).filter(WorkflowSpecModel.id == workflow_spec_id).first() + if localspec is None: + localspec = WorkflowSpecModel() + localspec.id = workflow_spec_id + if specdict['category'] == None: + localspec.category = None + else: + localspec.category = session.query(WorkflowSpecCategoryModel).filter(WorkflowSpecCategoryModel.id + == specdict['category']['id']).first() + localspec.display_order = specdict['display_order'] + localspec.display_name = specdict['display_name'] + localspec.name = specdict['name'] + localspec.description = specdict['description'] + session.add(localspec) + + changedfiles = get_changed_files(remote,workflow_spec_id,as_df=True) + updatefiles = changedfiles[~((changedfiles['new']==True) & (changedfiles['location']=='local'))] + deletefiles = changedfiles[((changedfiles['new']==True) & (changedfiles['location']=='local'))] + for delfile in deletefiles.reset_index().to_dict(orient='records'): + currentfile = session.query(FileModel).filter(FileModel.workflow_spec_id==workflow_spec_id, + FileModel.name == delfile['filename']).first() + FileService.delete_file(currentfile.id) + for updatefile in updatefiles.reset_index().to_dict(orient='records'): + currentfile = session.query(FileModel).filter(FileModel.workflow_spec_id==workflow_spec_id, + FileModel.name == updatefile['filename']).first() + if not currentfile: + currentfile = FileModel() + currentfile.name = updatefile['filename'] + currentfile.workflow_spec_id = workflow_spec_id + + currentfile.date_created = updatefile['date_created'] + currentfile.type = updatefile['type'] + currentfile.primary = updatefile['primary'] + currentfile.content_type = updatefile['content_type'] + currentfile.primary_process_id = updatefile['primary_process_id'] + session.add(currentfile) + + response = requests.get('http://'+remote+'/v1.0/file/'+updatefile['md5_hash']+'/hash_data') + FileService.update_file(currentfile,response.content,updatefile['type']) + session.commit() + + +def get_changed_files(remote,workflow_spec_id,as_df=False): """ gets a remote endpoint - gets the files for a workflow_spec on both local and remote and determines what files have been change and returns a list of those @@ -369,7 +418,10 @@ def get_changed_files(remote,workflow_spec_id): changedfiles.loc[changedfiles.index.isin(right['filename']),'new'] = True changedfiles = changedfiles.replace({pd.np.nan: None}) # return the list as a dict, let swagger convert it to json - return changedfiles.reset_index().to_dict(orient='records') + if as_df: + return changedfiles + else: + return changedfiles.reset_index().to_dict(orient='records') diff --git a/crc/models/workflow.py 
b/crc/models/workflow.py
index 718dfccf..0da32aec 100644
--- a/crc/models/workflow.py
+++ b/crc/models/workflow.py
@@ -69,7 +69,7 @@ class WorkflowStatus(enum.Enum):


 class WorkflowSpecDependencyFile(db.Model):
-    """Connects a workflow to the version of the specification files it depends on to execute"""
+    """Connects to a workflow to test the version of the specification files it depends on to execute"""
     file_data_id = db.Column(db.Integer, db.ForeignKey(FileDataModel.id), primary_key=True)
     workflow_id = db.Column(db.Integer, db.ForeignKey("workflow.id"), primary_key=True)

diff --git a/deploy/requirements.txt b/deploy/requirements.txt
index 420a888f..af827f27 100644
--- a/deploy/requirements.txt
+++ b/deploy/requirements.txt
@@ -22,8 +22,10 @@ docutils==0.16
 docxtpl==0.9.2
 et-xmlfile==1.0.1
 flask==1.1.2
+flask-admin==1.5.7
 flask-bcrypt==0.7.1
 flask-cors==3.0.8
+flask-mail==0.9.1
 flask-marshmallow==0.12.0
 flask-migrate==2.5.3
 flask-restful==0.3.8
@@ -55,17 +57,21 @@ pandas==1.0.3
 psycopg2-binary==2.8.5
 pyasn1==0.4.8
 pycparser==2.20
+PyGithub==1.53
 pygments==2.6.1
 pyjwt==1.7.1
 pyparsing==2.4.7
 pyrsistent==0.16.0
+python-box==5.2.0
 python-dateutil==2.8.1
 python-docx==0.8.10
 python-editor==1.0.4
+python-Levenshtein==0.12.0
 pytz==2020.1
 pyyaml==5.3.1
 recommonmark==0.6.0
 requests==2.23.0
+sentry-sdk==0.14.4
 six==1.14.0
 snowballstemmer==2.0.0
 soupsieve==2.0.1

From c57b17df1ed1beabdde7084eaba72f25013b4870 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Wed, 9 Dec 2020 12:13:17 -0500
Subject: [PATCH 12/22] Add a robust way of adding an API key, update examples
 and documentation for swagger API and add the ability to completely sync the
 local system from the remote system.

---
 config/default.py   |   8 ++
 crc/api.yml         | 177 +++++++++++++++++++++++++++++++++++++-------
 crc/api/workflow.py |  56 +++++++++++---
 3 files changed, 204 insertions(+), 37 deletions(-)

diff --git a/config/default.py b/config/default.py
index 13fc79ab..44651fd5 100644
--- a/config/default.py
+++ b/config/default.py
@@ -6,6 +6,14 @@ basedir = os.path.abspath(os.path.dirname(__file__))

 JSON_SORT_KEYS = False  # CRITICAL.  Do not sort the data when returning values to the front end.

+# The API_TOKEN is used to ensure that the
+# workflow sync can work without a lot of
+# back and forth.
+# you may want to change this to something simple for testing!!
+# NB, if you change this in the local endpoint, +# it needs to be changed in the remote endpoint as well +API_TOKEN = 'af95596f327c9ecc007b60414fc84b61' + NAME = "CR Connect Workflow" FLASK_PORT = environ.get('PORT0') or environ.get('FLASK_PORT', default="5000") CORS_ALLOW_ORIGINS = re.split(r',\s*', environ.get('CORS_ALLOW_ORIGINS', default="localhost:4200, localhost:5002")) diff --git a/crc/api.yml b/crc/api.yml index 3aa1f5d8..2789e249 100644 --- a/crc/api.yml +++ b/crc/api.yml @@ -100,11 +100,12 @@ paths: type: array items: $ref: "#/components/schemas/Study" - /workflow_spec/diff: + /workflow_spec/pullall: get: - operationId: crc.api.workflow.get_changed_workflows - summary: Provides a list of workflow specs and their signature - security: [] # Disable security for this endpoint only - we'll sanity check + operationId: crc.api.workflow.sync_all_changed_workflows + summary: Sync all workflows that have changed on the remote side and provide a list of the results + security: + - ApiKeyAuth : [] # in the endpoint parameters: - name: remote @@ -117,21 +118,51 @@ paths: - Workflow Spec States responses: '200': - description: An array of workflow specs, with last touched date and file signature. + description: An array of workflow specs that were synced from remote. content: application/json: schema: type: array items: - $ref: "#/components/schemas/Study" + type: string + example : ['top_level_workflow','3b495037-f7d4-4509-bf58-cee41c0c6b0e'] + + + + + /workflow_spec/diff: + get: + operationId: crc.api.workflow.get_changed_workflows + summary: Provides a list of workflow that differ from remote and if it is new or not + security : + - ApiKeyAuth : [] + # in the endpoint + parameters: + - name: remote + in: query + required: true + description: The remote endpoint + schema: + type: string + tags: + - Workflow Spec States + responses: + '200': + description: An array of workflow specs, with last touched date and which one is most recent. + content: + application/json: + schema: + type: array + items: + $ref: "#/components/schemas/WorkflowSpecDiffList" /workflow_spec/{workflow_spec_id}/files: get: operationId: crc.api.workflow.get_workflow_spec_files - summary: Provides a list of workflow specs and their signature - security: [] # Disable security for this endpoint only - we'll sanity check - # in the endpoint + summary: Provides a list of files for a workflow spec on this machine. + security : + - ApiKeyAuth : [] parameters: - name: workflow_spec_id in: path @@ -144,20 +175,20 @@ paths: - Workflow Spec States responses: '200': - description: An array of workflow specs, with last touched date and file signature. + description: An array of files for a workflow spec on the local system, with details. content: application/json: schema: type: array items: - $ref: "#/components/schemas/Study" + $ref: "#/components/schemas/WorkflowSpecFilesList" /workflow_spec/{workflow_spec_id}/files/sync: get: operationId: crc.api.workflow.sync_changed_files - summary: Provides a list of files that were updated - security: [] # Disable security for this endpoint only - we'll sanity check - # in the endpoint + summary: Syncs files from a workflow on a remote system and provides a list of files that were updated + security : + - ApiKeyAuth : [] parameters: - name: workflow_spec_id in: path @@ -176,20 +207,23 @@ paths: - Workflow Spec States responses: '200': - description: An array of workflow specs, with last touched date and file signature. + description: A list of files that were synced for the workflow. 
          content:
            application/json:
              schema:
                type: array
                items:
                  type : string
                  example : ["data_security_plan.dmn",'some_other_file.xml']


   /workflow_spec/{workflow_spec_id}/files/diff:
     get:
       operationId: crc.api.workflow.get_changed_files
-      summary: Provides a list of workflow specs and their signature
+      summary: Provides a list of files for a workflow specs that differ from remote and their signature
+      security :
+        - ApiKeyAuth : []
       parameters:
         - name: workflow_spec_id
           in: path
           required: true
           description: The workflow_spec id
           schema:
             type: string
         - name: remote
           in: query
           required: true
           description: The remote endpoint
           schema:
             type: string
       tags:
         - Workflow Spec States
       responses:
         '200':
-          description: An array of workflow specs, with last touched date and file signature.
+          description: An array of files that are different from remote, with last touched date and file signature.
           content:
             application/json:
               schema:
                 type: array
                 items:
-                  $ref: "#/components/schemas/Study"
+                  $ref: "#/components/schemas/WorkflowSpecFilesDiff"


   /workflow_spec/all:
     get:
       operationId: crc.api.workflow.get_all_spec_state
-      summary: Provides a list of files for a workflow spec
+      summary: Provides a list of workflow specs, last update date and thumbprint
+      security:
+        - ApiKeyAuth : []
       tags:
         - Workflow Spec States
       responses:
         '200':
           description: An array of workflow specs, with last touched date and file signature.
           content:
             application/json:
               schema:
                 type: array
                 items:
-                  $ref: "#/components/schemas/Study"
+                  $ref: "#/components/schemas/WorkflowSpecAll"


   /study/all:
     get:
       operationId: crc.api.study.all_studies
       scheme: bearer
       bearerFormat: JWT
       x-bearerInfoFunc: crc.api.user.verify_token_admin
+    ApiKeyAuth :
+      type : apiKey
+      in : header
+      name : X-CR-API-KEY
+      x-apikeyInfoFunc: crc.api.workflow.verify_token
+
   schemas:
     User:
       properties:
         id:
           type: string
+    WorkflowSpecDiffList:
+      properties:
+        workflow_spec_id:
+          type: string
+          example : top_level_workflow
+        date_created :
+          type: string
+          example : 2020-12-09 16:55:12.951500+00:00
+        location :
+          type : string
+          example : remote
+        new :
+          type : boolean
+          example : false
+    WorkflowSpecFilesList:
+      properties:
+        file_model_id:
+          type : integer
+          example : 171
+        workflow_spec_id :
+          type: string
+          example : top_level_workflow
+        filename :
+          type: string
+          example : data_security_plan.dmn
+        date_created :
+          type: string
+          example : 2020-12-01 13:58:12.420333+00:00
+        type:
+          type : string
+          example : dmn
+        primary :
+          type : boolean
+          example : false
+        content_type:
+          type: string
+          example : text/xml
+        primary_process_id:
+          type : string
+          example : null
+        md5_hash:
+          type: string
+          example: f12e2bbd-a20c-673b-ccb8-a8a1ea9c5b7b
+
+
+    WorkflowSpecFilesDiff:
+      properties:
+        filename :
+          type: string
+          example : data_security_plan.dmn
+        date_created :
+          type: string
+          example : 2020-12-01 13:58:12.420333+00:00
+        type:
+          type : string
+          example : dmn
+        primary :
+          type : boolean
+          example : false
+        content_type:
+          type: string
+          example : text/xml
+        primary_process_id:
+          type : string
+          example : null
+        md5_hash:
+          type: string
+          example: f12e2bbd-a20c-673b-ccb8-a8a1ea9c5b7b
+        location:
+          type : string
+          example : remote
+        new:
+          type: boolean
+          example : false
+
+    WorkflowSpecAll:
+      properties:
+        workflow_spec_id :
+          type: string
+          example : acaf1258-43b4-437e-8846-f612afa66811
+        date_created :
+          type: string
+          example : 2020-12-01 13:58:12.420333+00:00
+        md5_hash:
+          type: string
+          example: c30fd597f21715018eab12f97f9d4956
     Study:
       properties:
         id:
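Since the ApiKeyAuth scheme above is just a shared-secret header, any client holding the configured token can drive the sync endpoints. A hedged sketch of a client call (placeholder hosts; the token is the testing default from config/default.py in this patch, not a production value):

    import requests

    token = 'af95596f327c9ecc007b60414fc84b61'  # must match API_TOKEN on the server
    response = requests.get(
        'http://localhost:5000/v1.0/workflow_spec/diff',
        params={'remote': 'remote.example.com'},
        headers={'X-CR-API-KEY': token})
    for spec in response.json():
        print(spec['workflow_spec_id'], spec['location'], spec['new'])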
diff --git a/crc/api/workflow.py b/crc/api/workflow.py
index fb0c1e4b..cc290435 100644
--- a/crc/api/workflow.py
+++ b/crc/api/workflow.py
@@ -7,7 +7,7 @@ from hashlib import md5
 import pandas as pd
 from SpiffWorkflow.util.deep_merge import DeepMerge
 from flask import g
-from crc import session, db
+from crc import session, db, app
 from crc.api.common import ApiError, ApiErrorSchema
 from crc.models.api_models import WorkflowApi, WorkflowApiSchema, NavigationItem, NavigationItemSchema
 from crc.models.file import FileModel, LookupDataSchema, FileDataModel
@@ -269,12 +269,19 @@ def join_uuids(uuids):
                                                            # in the same order
     return hashlib.md5(combined_uuids.encode('utf8')).hexdigest()  # make a hash of the hashes

-def get_changed_workflows(remote):
+def verify_token(token, required_scopes):
+    if token == app.config['API_TOKEN']:
+        return {'scope':['any']}
+    else:
+        raise ApiError("permission_denied","API Token information is not correct")
+
+
+def get_changed_workflows(remote,as_df=False):
     """
     gets a remote endpoint - gets the workflows and then
     determines what workflows are different from the remote endpoint
     """
-    response = requests.get('http://'+remote+'/v1.0/workflow_spec/all')
+    response = requests.get('http://'+remote+'/v1.0/workflow_spec/all',headers={'X-CR-API-KEY':app.config['API_TOKEN']})

     # This is probably very insecure and may allow cross site attacks - fix later
     remote = pd.DataFrame(json.loads(response.text))
@@ -318,14 +325,25 @@ def get_changed_workflows(remote):
     output = changedfiles[~changedfiles.index.isin(right['workflow_spec_id'])]

     # return the list as a dict, let swagger convert it to json
-    return output.reset_index().to_dict(orient='records')
+    if as_df:
+        return output
+    else:
+        return output.reset_index().to_dict(orient='records')
+
+
+def sync_all_changed_workflows(remote):
+
+    workflowsdf = get_changed_workflows(remote,as_df=True)
+    workflows = workflowsdf.reset_index().to_dict(orient='records')
+    for workflow in workflows:
+        sync_changed_files(remote,workflow['workflow_spec_id'])
+    return [x['workflow_spec_id'] for x in workflows]

-def sync_all_changed_files(remote):
-    pass

 def sync_changed_files(remote,workflow_spec_id):
     # make sure that spec is local before syncing files
-    remotespectext = requests.get('http://'+remote+'/v1.0/workflow-specification/'+workflow_spec_id)
+    remotespectext = requests.get('http://'+remote+'/v1.0/workflow-specification/'+workflow_spec_id,
+                                  headers={'X-CR-API-KEY': app.config['API_TOKEN']})
     specdict = json.loads(remotespectext.text)
     localspec = session.query(WorkflowSpecModel).filter(WorkflowSpecModel.id == workflow_spec_id).first()
     if localspec is None:
         localspec = WorkflowSpecModel()
         localspec.id = workflow_spec_id
@@ -343,13 +361,20 @@ def sync_changed_files(remote,workflow_spec_id):
         session.add(localspec)

     changedfiles = get_changed_files(remote,workflow_spec_id,as_df=True)
+    if len(changedfiles)==0:
+        return []
     updatefiles = changedfiles[~((changedfiles['new']==True) & (changedfiles['location']=='local'))]
+    updatefiles = updatefiles.reset_index().to_dict(orient='records')

     deletefiles = changedfiles[((changedfiles['new']==True) & (changedfiles['location']=='local'))]
-    for delfile in deletefiles.reset_index().to_dict(orient='records'):
+    deletefiles = deletefiles.reset_index().to_dict(orient='records')
+
+    for delfile in deletefiles:
         currentfile = session.query(FileModel).filter(FileModel.workflow_spec_id==workflow_spec_id,
                                                       FileModel.name == delfile['filename']).first()
         FileService.delete_file(currentfile.id)
-    for updatefile in updatefiles.reset_index().to_dict(orient='records'):
+    for updatefile in updatefiles:
         currentfile = session.query(FileModel).filter(FileModel.workflow_spec_id==workflow_spec_id,
                                                       FileModel.name == updatefile['filename']).first()
         if not currentfile:
             currentfile = FileModel()
             currentfile.name = updatefile['filename']
             currentfile.workflow_spec_id = workflow_spec_id

         currentfile.date_created = updatefile['date_created']
         currentfile.type = updatefile['type']
         currentfile.primary = updatefile['primary']
         currentfile.content_type = updatefile['content_type']
         currentfile.primary_process_id = updatefile['primary_process_id']
         session.add(currentfile)

-        response = requests.get('http://'+remote+'/v1.0/file/'+updatefile['md5_hash']+'/hash_data')
+        response = requests.get('http://'+remote+'/v1.0/file/'+updatefile['md5_hash']+'/hash_data',
+                                headers={'X-CR-API-KEY': app.config['API_TOKEN']})
         FileService.update_file(currentfile,response.content,updatefile['type'])
     session.commit()
+    return [x['filename'] for x in updatefiles]


 def get_changed_files(remote,workflow_spec_id,as_df=False):
@@ -375,7 +402,8 @@ def get_changed_files(remote,workflow_spec_id,as_df=False):
     local and remote and determines what files have been changed and returns a list of those
     files
     """
-    response = requests.get('http://'+remote+'/v1.0/workflow_spec/'+workflow_spec_id+'/files')
+    response = requests.get('http://'+remote+'/v1.0/workflow_spec/'+workflow_spec_id+'/files',
+                            headers={'X-CR-API-KEY':app.config['API_TOKEN']})
     # This is probably very insecure and may allow cross site attacks - fix later
     remote = pd.DataFrame(json.loads(response.text))
     # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index
@@ -386,7 +414,11 @@ def get_changed_files(remote,workflow_spec_id,as_df=False):
                              left_on=['filename','md5_hash'],
                              how = 'outer' ,
                              indicator=True).loc[lambda x : x['_merge']!='both']
-
+    if len(different) == 0:
+        if as_df:
+            return different
+        else:
+            return []
     # each line has a tag on it - if it was in the left or the right,
     # label it so we know if that was on the remote or local machine
     different.loc[different['_merge']=='left_only','location'] = 'remote'

From e377a05deaec8a372a6c52a397a48baaff073ee5 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Wed, 9 Dec 2020 13:50:52 -0500
Subject: [PATCH 13/22] Add some punctuation

---
 crc/api.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crc/api.yml b/crc/api.yml
index 2789e249..78436178 100644
--- a/crc/api.yml
+++ b/crc/api.yml
@@ -220,7 +220,7 @@ paths:
   /workflow_spec/{workflow_spec_id}/files/diff:
     get:
       operationId: crc.api.workflow.get_changed_files
-      summary: Provides a list of files for a workflow specs that differ from remote and their signature
+      summary: Provides a list of files for a workflow specs that differ from remote and their signature.
       security :
         - ApiKeyAuth : []

From a8203ed01ddd86a48904c22caabda5d3acc3718a Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Thu, 10 Dec 2020 10:06:21 -0500
Subject: [PATCH 14/22] save changes before refactor

---
 config/default.py   |  2 +-
 crc/api/workflow.py | 20 +++++++++++++++++---
 example_data.py     |  1 +
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/config/default.py b/config/default.py
index 44651fd5..1570811c 100644
--- a/config/default.py
+++ b/config/default.py
@@ -12,7 +12,7 @@ JSON_SORT_KEYS = False  # CRITICAL.  Do not sort the data when returning values
 # you may want to change this to something simple for testing!!
# NB, if you change this in the local endpoint, # it needs to be changed in the remote endpoint as well -API_TOKEN = 'af95596f327c9ecc007b60414fc84b61' +API_TOKEN = environ.get('API_TOKEN', default = 'af95596f327c9ecc007b60414fc84b61') NAME = "CR Connect Workflow" FLASK_PORT = environ.get('PORT0') or environ.get('FLASK_PORT', default="5000") diff --git a/crc/api/workflow.py b/crc/api/workflow.py index cc290435..8da5c88a 100644 --- a/crc/api/workflow.py +++ b/crc/api/workflow.py @@ -352,8 +352,17 @@ def sync_changed_files(remote,workflow_spec_id): if specdict['category'] == None: localspec.category = None else: - localspec.category = session.query(WorkflowSpecCategoryModel).filter(WorkflowSpecCategoryModel.id - == specdict['category']['id']).first() + localcategory = session.query(WorkflowSpecCategoryModel).filter(WorkflowSpecCategoryModel.name + == specdict['category']['name']).first() + if localcategory == None: + #category doesn't exist - lets make it + localcategory = WorkflowSpecCategoryModel() + localcategory.name = specdict['category']['name'] + localcategory.display_name = specdict['category']['display_name'] + localcategory.display_order = specdict['category']['display_order'] + session.add(localcategory) + localspec.category = localcategory + localspec.display_order = specdict['display_order'] localspec.display_name = specdict['display_name'] localspec.name = specdict['name'] @@ -372,7 +381,12 @@ def sync_changed_files(remote,workflow_spec_id): for delfile in deletefiles: currentfile = session.query(FileModel).filter(FileModel.workflow_spec_id==workflow_spec_id, FileModel.name == delfile['filename']).first() - FileService.delete_file(currentfile.id) + + # it is more appropriate to archive the file than delete + # due to the fact that we might have workflows that are using the + # file data + currentfile.archived = True + session.add(currentfile) for updatefile in updatefiles: currentfile = session.query(FileModel).filter(FileModel.workflow_spec_id==workflow_spec_id, diff --git a/example_data.py b/example_data.py index d9b4c73b..a46112a2 100644 --- a/example_data.py +++ b/example_data.py @@ -67,6 +67,7 @@ class ExampleDataLoader: display_order=6 ), ] + db.session.execute("select setval('workflow_spec_category_id_seq',7);") db.session.add_all(categories) db.session.commit() From 3f56dfe48477677206b01e303084ab355ee4a077 Mon Sep 17 00:00:00 2001 From: Kelly McDonald Date: Thu, 10 Dec 2020 10:46:23 -0500 Subject: [PATCH 15/22] Move all workflow sync stuff into new file Make changes to api naming scheme add some error checking around endpoints for missing/invalid endpoints --- crc/api.yml | 52 +++--- crc/api/workflow.py | 305 ----------------------------------- crc/api/workflow_sync.py | 336 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 362 insertions(+), 331 deletions(-) create mode 100644 crc/api/workflow_sync.py diff --git a/crc/api.yml b/crc/api.yml index 78436178..41e76f90 100644 --- a/crc/api.yml +++ b/crc/api.yml @@ -100,9 +100,9 @@ paths: type: array items: $ref: "#/components/schemas/Study" - /workflow_spec/pullall: + /workflow_sync/pullall: get: - operationId: crc.api.workflow.sync_all_changed_workflows + operationId: crc.api.workflow_sync.sync_all_changed_workflows summary: Sync all workflows that have changed on the remote side and provide a list of the results security: - ApiKeyAuth : [] @@ -115,7 +115,7 @@ paths: schema: type: string tags: - - Workflow Spec States + - Workflow Sync API responses: '200': description: An array of workflow specs that were synced 
from remote. @@ -130,9 +130,9 @@ paths: - /workflow_spec/diff: + /workflow_sync/diff: get: - operationId: crc.api.workflow.get_changed_workflows + operationId: crc.api.workflow_sync.get_changed_workflows summary: Provides a list of workflow that differ from remote and if it is new or not security : - ApiKeyAuth : [] @@ -145,7 +145,7 @@ paths: schema: type: string tags: - - Workflow Spec States + - Workflow Sync API responses: '200': description: An array of workflow specs, with last touched date and which one is most recent. @@ -157,9 +157,9 @@ paths: $ref: "#/components/schemas/WorkflowSpecDiffList" - /workflow_spec/{workflow_spec_id}/files: + /workflow_sync/{workflow_spec_id}/files: get: - operationId: crc.api.workflow.get_workflow_spec_files + operationId: crc.api.workflow_sync.get_workflow_spec_files summary: Provides a list of files for a workflow spec on this machine. security : - ApiKeyAuth : [] @@ -172,7 +172,7 @@ paths: type: string tags: - - Workflow Spec States + - Workflow Sync API responses: '200': description: An array of files for a workflow spec on the local system, with details. @@ -183,9 +183,9 @@ paths: items: $ref: "#/components/schemas/WorkflowSpecFilesList" - /workflow_spec/{workflow_spec_id}/files/sync: + /workflow_sync/{workflow_spec_id}/files/sync: get: - operationId: crc.api.workflow.sync_changed_files + operationId: crc.api.workflow_sync.sync_changed_files summary: Syncs files from a workflow on a remote system and provides a list of files that were updated security : - ApiKeyAuth : [] @@ -204,7 +204,7 @@ paths: type: string tags: - - Workflow Spec States + - Workflow Sync API responses: '200': description: A list of files that were synced for the workflow. @@ -217,9 +217,9 @@ paths: example : ["data_security_plan.dmn",'some_other_file.xml'] - /workflow_spec/{workflow_spec_id}/files/diff: + /workflow_sync/{workflow_spec_id}/files/diff: get: - operationId: crc.api.workflow.get_changed_files + operationId: crc.api.workflow_sync.get_changed_files summary: Provides a list of files for a workflow specs that differ from remote and their signature. security : - ApiKeyAuth : [] @@ -239,7 +239,7 @@ paths: type: string tags: - - Workflow Spec States + - Workflow Sync API responses: '200': description: An array of files that are different from remote, with last touched date and file signature. @@ -251,15 +251,15 @@ paths: $ref: "#/components/schemas/WorkflowSpecFilesDiff" - /workflow_spec/all: + /workflow_sync/all: get: - operationId: crc.api.workflow.get_all_spec_state + operationId: crc.api.workflow_sync.get_all_spec_state summary: Provides a list of workflow specs, last update date and thumbprint security: - ApiKeyAuth : [] tags: - - Workflow Spec States + - Workflow Sync API responses: '200': description: An array of workflow specs, with last touched date and file signature. @@ -547,7 +547,7 @@ paths: description: The workflow spec category has been removed. 
/file: parameters: - - name: workflow_spec_id + - name: workflow_sync_id in: query required: false description: The unique id of a workflow specification @@ -1353,7 +1353,7 @@ components: type : apiKey in : header name : X-CR-API-KEY - x-apikeyInfoFunc: crc.api.workflow.verify_token + x-apikeyInfoFunc: crc.api.workflow_sync.verify_token schemas: User: @@ -1380,7 +1380,7 @@ components: type: string WorkflowSpecDiffList: properties: - workflow_spec_id: + workflow_sync_id: type: string example : top_level_workflow date_created : @@ -1397,7 +1397,7 @@ components: file_model_id: type : integer example : 171 - workflow_spec_id : + workflow_sync_id : type: string example : top_level_workflow filename : @@ -1455,7 +1455,7 @@ components: WorkflowSpecAll: properties: - workflow_spec_id : + workflow_sync_id : type: string example : acaf1258-43b4-437e-8846-f612afa66811 date_created : @@ -1587,7 +1587,7 @@ components: content_type: type: string example: "application/xml" - workflow_spec_id: + workflow_sync_id: type: string example: "random_fact" x-nullable: true @@ -1609,7 +1609,7 @@ components: $ref: "#/components/schemas/NavigationItem" next_task: $ref: "#/components/schemas/Task" - workflow_spec_id: + workflow_sync_id: type: string spec_version: type: string @@ -1625,7 +1625,7 @@ components: example: id: 291234 status: 'user_input_required' - workflow_spec_id: 'random_fact' + workflow_sync_id: 'random_fact' spec_version: 'v1.1 [22,23]' is_latest_spec: True next_task: diff --git a/crc/api/workflow.py b/crc/api/workflow.py index 8da5c88a..d14daf11 100644 --- a/crc/api/workflow.py +++ b/crc/api/workflow.py @@ -1,11 +1,4 @@ -import hashlib -import json import uuid -from io import StringIO -from hashlib import md5 - -import pandas as pd -from SpiffWorkflow.util.deep_merge import DeepMerge from flask import g from crc import session, db, app from crc.api.common import ApiError, ApiErrorSchema @@ -21,8 +14,6 @@ from crc.services.study_service import StudyService from crc.services.user_service import UserService from crc.services.workflow_processor import WorkflowProcessor from crc.services.workflow_service import WorkflowService -from flask_cors import cross_origin -import requests def all_specifications(): schema = WorkflowSpecModelSchema(many=True) @@ -263,299 +254,3 @@ def _verify_user_and_role(processor, spiff_task): raise ApiError.from_task("permission_denied", f"This task must be completed by '{allowed_users}', " f"but you are {user.uid}", spiff_task) -def join_uuids(uuids): - """Joins a pandas Series of uuids and combines them in one hash""" - combined_uuids = ''.join([str(uuid) for uuid in uuids.sort_values()]) # ensure that values are always - # in the same order - return hashlib.md5(combined_uuids.encode('utf8')).hexdigest() # make a hash of the hashes - -def verify_token(token, required_scopes): - if token == app.config['API_TOKEN']: - return {'scope':['any']} - else: - raise ApiError("permission_denied","API Token information is not correct") - - -def get_changed_workflows(remote,as_df=False): - """ - gets a remote endpoint - gets the workflows and then - determines what workflows are different from the remote endpoint - """ - response = requests.get('http://'+remote+'/v1.0/workflow_spec/all',headers={'X-CR-API-KEY':app.config['API_TOKEN']}) - - # This is probably very and may allow cross site attacks - fix later - remote = pd.DataFrame(json.loads(response.text)) - # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index - local = 
get_all_spec_state_dataframe().reset_index() - - # merge these on workflow spec id and hash - this will - # make two different date columns date_x and date_y - different = remote.merge(local, - right_on=['workflow_spec_id','md5_hash'], - left_on=['workflow_spec_id','md5_hash'], - how = 'outer' , - indicator=True).loc[lambda x : x['_merge']!='both'] - - # each line has a tag on it - if was in the left or the right, - # label it so we know if that was on the remote or local machine - different.loc[different['_merge']=='left_only','location'] = 'remote' - different.loc[different['_merge']=='right_only','location'] = 'local' - - # this takes the different date_created_x and date-created_y columns and - # combines them back into one date_created column - index = different['date_created_x'].isnull() - different.loc[index,'date_created_x'] = different[index]['date_created_y'] - different = different[['workflow_spec_id','date_created_x','location']].copy() - different.columns=['workflow_spec_id','date_created','location'] - - # our different list will have multiple entries for a workflow if there is a version on either side - # we want to grab the most recent one, so we sort and grab the most recent one for each workflow - changedfiles = different.sort_values('date_created',ascending=False).groupby('workflow_spec_id').first() - - # get an exclusive or list of workflow ids - that is we want lists of files that are - # on one machine or the other, but not both - remote_spec_ids = remote[['workflow_spec_id']] - local_spec_ids = local[['workflow_spec_id']] - left = remote_spec_ids[~remote_spec_ids['workflow_spec_id'].isin(local_spec_ids['workflow_spec_id'])] - right = local_spec_ids[~local_spec_ids['workflow_spec_id'].isin(remote_spec_ids['workflow_spec_id'])] - - # flag files as new that are only on the remote box and remove the files that are only on the local box - changedfiles['new'] = False - changedfiles.loc[changedfiles.index.isin(left['workflow_spec_id']), 'new'] = True - output = changedfiles[~changedfiles.index.isin(right['workflow_spec_id'])] - - # return the list as a dict, let swagger convert it to json - if as_df: - return output - else: - return output.reset_index().to_dict(orient='records') - - -def sync_all_changed_workflows(remote): - - workflowsdf = get_changed_workflows(remote,as_df=True) - workflows = workflowsdf.reset_index().to_dict(orient='records') - for workflow in workflows: - sync_changed_files(remote,workflow['workflow_spec_id']) - return [x['workflow_spec_id'] for x in workflows] - - -def sync_changed_files(remote,workflow_spec_id): - # make sure that spec is local before syncing files - remotespectext = requests.get('http://'+remote+'/v1.0/workflow-specification/'+workflow_spec_id, - headers={'X-CR-API-KEY': app.config['API_TOKEN']}) - specdict = json.loads(remotespectext.text) - localspec = session.query(WorkflowSpecModel).filter(WorkflowSpecModel.id == workflow_spec_id).first() - if localspec is None: - localspec = WorkflowSpecModel() - localspec.id = workflow_spec_id - if specdict['category'] == None: - localspec.category = None - else: - localcategory = session.query(WorkflowSpecCategoryModel).filter(WorkflowSpecCategoryModel.name - == specdict['category']['name']).first() - if localcategory == None: - #category doesn't exist - lets make it - localcategory = WorkflowSpecCategoryModel() - localcategory.name = specdict['category']['name'] - localcategory.display_name = specdict['category']['display_name'] - localcategory.display_order = 
specdict['category']['display_order'] - session.add(localcategory) - localspec.category = localcategory - - localspec.display_order = specdict['display_order'] - localspec.display_name = specdict['display_name'] - localspec.name = specdict['name'] - localspec.description = specdict['description'] - session.add(localspec) - - changedfiles = get_changed_files(remote,workflow_spec_id,as_df=True) - if len(changedfiles)==0: - return [] - updatefiles = changedfiles[~((changedfiles['new']==True) & (changedfiles['location']=='local'))] - updatefiles = updatefiles.reset_index().to_dict(orient='records') - - deletefiles = changedfiles[((changedfiles['new']==True) & (changedfiles['location']=='local'))] - deletefiles = deletefiles.reset_index().to_dict(orient='records') - - for delfile in deletefiles: - currentfile = session.query(FileModel).filter(FileModel.workflow_spec_id==workflow_spec_id, - FileModel.name == delfile['filename']).first() - - # it is more appropriate to archive the file than delete - # due to the fact that we might have workflows that are using the - # file data - currentfile.archived = True - session.add(currentfile) - - for updatefile in updatefiles: - currentfile = session.query(FileModel).filter(FileModel.workflow_spec_id==workflow_spec_id, - FileModel.name == updatefile['filename']).first() - if not currentfile: - currentfile = FileModel() - currentfile.name = updatefile['filename'] - currentfile.workflow_spec_id = workflow_spec_id - - currentfile.date_created = updatefile['date_created'] - currentfile.type = updatefile['type'] - currentfile.primary = updatefile['primary'] - currentfile.content_type = updatefile['content_type'] - currentfile.primary_process_id = updatefile['primary_process_id'] - session.add(currentfile) - - response = requests.get('http://'+remote+'/v1.0/file/'+updatefile['md5_hash']+'/hash_data', - headers={'X-CR-API-KEY': app.config['API_TOKEN']}) - FileService.update_file(currentfile,response.content,updatefile['type']) - session.commit() - return [x['filename'] for x in updatefiles] - - -def get_changed_files(remote,workflow_spec_id,as_df=False): - """ - gets a remote endpoint - gets the files for a workflow_spec on both - local and remote and determines what files have been change and returns a list of those - files - """ - response = requests.get('http://'+remote+'/v1.0/workflow_spec/'+workflow_spec_id+'/files', - headers={'X-CR-API-KEY':app.config['API_TOKEN']}) - # This is probably very and may allow cross site attacks - fix later - remote = pd.DataFrame(json.loads(response.text)) - # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index - local = get_workflow_spec_files_dataframe(workflow_spec_id).reset_index() - local['md5_hash'] = local['md5_hash'].astype('str') - different = remote.merge(local, - right_on=['filename','md5_hash'], - left_on=['filename','md5_hash'], - how = 'outer' , - indicator=True).loc[lambda x : x['_merge']!='both'] - if len(different) == 0: - if as_df: - return different - else: - return [] - # each line has a tag on it - if was in the left or the right, - # label it so we know if that was on the remote or local machine - different.loc[different['_merge']=='left_only','location'] = 'remote' - different.loc[different['_merge']=='right_only','location'] = 'local' - - # this takes the different date_created_x and date-created_y columns and - # combines them back into one date_created column - dualfields = ['date_created','type','primary','content_type','primary_process_id'] - for merge in 
dualfields: - index = different[merge+'_x'].isnull() - different.loc[index,merge+'_x'] = different[index][merge+'_y'] - - fieldlist = [fld+'_x' for fld in dualfields] - different = different[ fieldlist + ['md5_hash','filename','location']].copy() - - different.columns=dualfields+['md5_hash','filename','location'] - # our different list will have multiple entries for a workflow if there is a version on either side - # we want to grab the most recent one, so we sort and grab the most recent one for each workflow - changedfiles = different.sort_values('date_created',ascending=False).groupby('filename').first() - - # get an exclusive or list of workflow ids - that is we want lists of files that are - # on one machine or the other, but not both - remote_spec_ids = remote[['filename']] - local_spec_ids = local[['filename']] - left = remote_spec_ids[~remote_spec_ids['filename'].isin(local_spec_ids['filename'])] - right = local_spec_ids[~local_spec_ids['filename'].isin(remote_spec_ids['filename'])] - changedfiles['new'] = False - changedfiles.loc[changedfiles.index.isin(left['filename']), 'new'] = True - changedfiles.loc[changedfiles.index.isin(right['filename']),'new'] = True - changedfiles = changedfiles.replace({pd.np.nan: None}) - # return the list as a dict, let swagger convert it to json - if as_df: - return changedfiles - else: - return changedfiles.reset_index().to_dict(orient='records') - - - -def get_all_spec_state(): - """ - Return a list of all workflow specs along with last updated date and a - thumbprint of all of the files that are used for that workflow_spec - Convert into a dict list from a dataframe - """ - df = get_all_spec_state_dataframe() - return df.reset_index().to_dict(orient='records') - - -def get_workflow_spec_files(workflow_spec_id): - """ - Return a list of all workflow specs along with last updated date and a - thumbprint of all of the files that are used for that workflow_spec - Convert into a dict list from a dataframe - """ - df = get_workflow_spec_files_dataframe(workflow_spec_id) - return df.reset_index().to_dict(orient='records') - - -def get_workflow_spec_files_dataframe(workflowid): - """ - Return a list of all files for a workflow_spec along with last updated date and a - hash so we can determine file differences for a changed workflow on a box. 
- Return a dataframe - """ - x = session.query(FileDataModel).join(FileModel).filter(FileModel.workflow_spec_id==workflowid) - # there might be a cleaner way of getting a data frome from some of the - # fields in the ORM - but this works OK - filelist = [] - for file in x: - filelist.append({'file_model_id':file.file_model_id, - 'workflow_spec_id': file.file_model.workflow_spec_id, - 'md5_hash':file.md5_hash, - 'filename':file.file_model.name, - 'type':file.file_model.type.name, - 'primary':file.file_model.primary, - 'content_type':file.file_model.content_type, - 'primary_process_id':file.file_model.primary_process_id, - 'date_created':file.date_created}) - if len(filelist) == 0: - return pd.DataFrame(columns=['file_model_id', - 'workflow_spec_id', - 'md5_hash', - 'filename', - 'type', - 'primary', - 'content_type', - 'primary_process_id', - 'date_created']) - df = pd.DataFrame(filelist).sort_values('date_created').groupby('file_model_id').last() - df['date_created'] = df['date_created'].astype('str') - return df - - - -def get_all_spec_state_dataframe(): - """ - Return a list of all workflow specs along with last updated date and a - thumbprint of all of the files that are used for that workflow_spec - Return a dataframe - """ - x = session.query(FileDataModel).join(FileModel) - # there might be a cleaner way of getting a data frome from some of the - # fields in the ORM - but this works OK - filelist = [] - for file in x: - filelist.append({'file_model_id':file.file_model_id, - 'workflow_spec_id': file.file_model.workflow_spec_id, - 'md5_hash':file.md5_hash, - 'filename':file.file_model.name, - 'date_created':file.date_created}) - df = pd.DataFrame(filelist) - - # get a distinct list of file_model_id's with the most recent file_data retained - df = df.sort_values('date_created').drop_duplicates(['file_model_id'],keep='last').copy() - - # take that list and then group by workflow_spec and retain the most recently touched file - # and make a consolidated hash of the md5_checksums - this acts as a 'thumbprint' for each - # workflow spec - df = df.groupby('workflow_spec_id').agg({'date_created':'max', - 'md5_hash':join_uuids}).copy() - # get only the columns we are really interested in returning - df = df[['date_created','md5_hash']].copy() - # convert dates to string - df['date_created'] = df['date_created'].astype('str') - return df - diff --git a/crc/api/workflow_sync.py b/crc/api/workflow_sync.py new file mode 100644 index 00000000..98f04236 --- /dev/null +++ b/crc/api/workflow_sync.py @@ -0,0 +1,336 @@ +import hashlib +import json +import pandas as pd +import requests +from crc import session, app +from crc.api.common import ApiError +from crc.models.file import FileModel, FileDataModel +from crc.models.workflow import WorkflowSpecModel, WorkflowSpecCategoryModel +from crc.services.file_service import FileService + + +def join_uuids(uuids): + """Joins a pandas Series of uuids and combines them in one hash""" + combined_uuids = ''.join([str(uuid) for uuid in uuids.sort_values()]) # ensure that values are always + # in the same order + return hashlib.md5(combined_uuids.encode('utf8')).hexdigest() # make a hash of the hashes + +def verify_token(token, required_scopes): + if token == app.config['API_TOKEN']: + return {'scope':['any']} + else: + raise ApiError("permission_denied", "API Token information is not correct") + + +def get_changed_workflows(remote,as_df=False): + """ + gets a remote endpoint - gets the workflows and then + determines what workflows are different from the remote 
endpoint + """ + try: + response = requests.get('http://'+remote+'/v1.0/workflow_sync/all',headers={'X-CR-API-KEY':app.config['API_TOKEN']}) + except: + raise ApiError("endpoint error", 'had a problem connecting to '+remote) + if response.status_code != 200: + raise ApiError("endpoint error", response.text) + + remote = pd.DataFrame(json.loads(response.text)) + + # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index + local = get_all_spec_state_dataframe().reset_index() + + # merge these on workflow spec id and hash - this will + # make two different date columns date_x and date_y + different = remote.merge(local, + right_on=['workflow_spec_id','md5_hash'], + left_on=['workflow_spec_id','md5_hash'], + how = 'outer' , + indicator=True).loc[lambda x : x['_merge']!='both'] + if len(different)==0: + return [] + # each line has a tag on it - if was in the left or the right, + # label it so we know if that was on the remote or local machine + different.loc[different['_merge']=='left_only','location'] = 'remote' + different.loc[different['_merge']=='right_only','location'] = 'local' + + # this takes the different date_created_x and date-created_y columns and + # combines them back into one date_created column + index = different['date_created_x'].isnull() + different.loc[index,'date_created_x'] = different[index]['date_created_y'] + different = different[['workflow_spec_id','date_created_x','location']].copy() + different.columns=['workflow_spec_id','date_created','location'] + + # our different list will have multiple entries for a workflow if there is a version on either side + # we want to grab the most recent one, so we sort and grab the most recent one for each workflow + changedfiles = different.sort_values('date_created',ascending=False).groupby('workflow_spec_id').first() + + # get an exclusive or list of workflow ids - that is we want lists of files that are + # on one machine or the other, but not both + remote_spec_ids = remote[['workflow_spec_id']] + local_spec_ids = local[['workflow_spec_id']] + left = remote_spec_ids[~remote_spec_ids['workflow_spec_id'].isin(local_spec_ids['workflow_spec_id'])] + right = local_spec_ids[~local_spec_ids['workflow_spec_id'].isin(remote_spec_ids['workflow_spec_id'])] + + # flag files as new that are only on the remote box and remove the files that are only on the local box + changedfiles['new'] = False + changedfiles.loc[changedfiles.index.isin(left['workflow_spec_id']), 'new'] = True + output = changedfiles[~changedfiles.index.isin(right['workflow_spec_id'])] + + # return the list as a dict, let swagger convert it to json + if as_df: + return output + else: + return output.reset_index().to_dict(orient='records') + + +def sync_all_changed_workflows(remote): + + workflowsdf = get_changed_workflows(remote,as_df=True) + if len(workflowsdf) ==0: + return [] + workflows = workflowsdf.reset_index().to_dict(orient='records') + for workflow in workflows: + sync_changed_files(remote,workflow['workflow_spec_id']) + return [x['workflow_spec_id'] for x in workflows] + + +def sync_changed_files(remote,workflow_spec_id): + # make sure that spec is local before syncing files + try: + remotespectext = requests.get('http://'+remote+'/v1.0/workflow-specification/'+workflow_spec_id, + headers={'X-CR-API-KEY': app.config['API_TOKEN']}) + except: + raise ApiError("endpoint error", 'had a problem connecting to '+remote) + + if remotespectext.status_code != 200: + raise ApiError("endpoint error", response.text) + + specdict = 
json.loads(remotespectext.text) + + localspec = session.query(WorkflowSpecModel).filter(WorkflowSpecModel.id == workflow_spec_id).first() + if localspec is None: + localspec = WorkflowSpecModel() + localspec.id = workflow_spec_id + if specdict['category'] == None: + localspec.category = None + else: + localcategory = session.query(WorkflowSpecCategoryModel).filter(WorkflowSpecCategoryModel.name + == specdict['category']['name']).first() + if localcategory == None: + #category doesn't exist - lets make it + localcategory = WorkflowSpecCategoryModel() + localcategory.name = specdict['category']['name'] + localcategory.display_name = specdict['category']['display_name'] + localcategory.display_order = specdict['category']['display_order'] + session.add(localcategory) + localspec.category = localcategory + + localspec.display_order = specdict['display_order'] + localspec.display_name = specdict['display_name'] + localspec.name = specdict['name'] + localspec.description = specdict['description'] + session.add(localspec) + + changedfiles = get_changed_files(remote,workflow_spec_id,as_df=True) + if len(changedfiles)==0: + return [] + updatefiles = changedfiles[~((changedfiles['new']==True) & (changedfiles['location']=='local'))] + updatefiles = updatefiles.reset_index().to_dict(orient='records') + + deletefiles = changedfiles[((changedfiles['new']==True) & (changedfiles['location']=='local'))] + deletefiles = deletefiles.reset_index().to_dict(orient='records') + + for delfile in deletefiles: + currentfile = session.query(FileModel).filter(FileModel.workflow_spec_id==workflow_spec_id, + FileModel.name == delfile['filename']).first() + + # it is more appropriate to archive the file than delete + # due to the fact that we might have workflows that are using the + # file data + currentfile.archived = True + session.add(currentfile) + + for updatefile in updatefiles: + currentfile = session.query(FileModel).filter(FileModel.workflow_spec_id==workflow_spec_id, + FileModel.name == updatefile['filename']).first() + if not currentfile: + currentfile = FileModel() + currentfile.name = updatefile['filename'] + currentfile.workflow_spec_id = workflow_spec_id + + currentfile.date_created = updatefile['date_created'] + currentfile.type = updatefile['type'] + currentfile.primary = updatefile['primary'] + currentfile.content_type = updatefile['content_type'] + currentfile.primary_process_id = updatefile['primary_process_id'] + session.add(currentfile) + try: + response = requests.get('http://'+remote+'/v1.0/file/'+updatefile['md5_hash']+'/hash_data', + headers={'X-CR-API-KEY': app.config['API_TOKEN']}) + except: + raise ApiError("endpoint error", 'had a problem connecting to ' + remote) + + if response.status_code != 200: + raise ApiError("endpoint error", response.text) + + FileService.update_file(currentfile,response.content,updatefile['type']) + session.commit() + return [x['filename'] for x in updatefiles] + + +def get_changed_files(remote,workflow_spec_id,as_df=False): + """ + gets a remote endpoint - gets the files for a workflow_spec on both + local and remote and determines what files have been change and returns a list of those + files + """ + try: + response = requests.get('http://'+remote+'/v1.0/workflow_sync/'+workflow_spec_id+'/files', + headers={'X-CR-API-KEY':app.config['API_TOKEN']}) + except: + raise ApiError("endpoint error", 'had a problem connecting to '+remote) + + if response.status_code != 200: + raise ApiError("endpoint error", response.text) + + # This is probably very and may allow 
cross site attacks - fix later + remote = pd.DataFrame(json.loads(response.text)) + # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index + local = get_workflow_spec_files_dataframe(workflow_spec_id).reset_index() + local['md5_hash'] = local['md5_hash'].astype('str') + different = remote.merge(local, + right_on=['filename','md5_hash'], + left_on=['filename','md5_hash'], + how = 'outer' , + indicator=True).loc[lambda x : x['_merge']!='both'] + if len(different) == 0: + if as_df: + return different + else: + return [] + # each line has a tag on it - if was in the left or the right, + # label it so we know if that was on the remote or local machine + different.loc[different['_merge']=='left_only','location'] = 'remote' + different.loc[different['_merge']=='right_only','location'] = 'local' + + # this takes the different date_created_x and date-created_y columns and + # combines them back into one date_created column + dualfields = ['date_created','type','primary','content_type','primary_process_id'] + for merge in dualfields: + index = different[merge+'_x'].isnull() + different.loc[index,merge+'_x'] = different[index][merge+'_y'] + + fieldlist = [fld+'_x' for fld in dualfields] + different = different[ fieldlist + ['md5_hash','filename','location']].copy() + + different.columns=dualfields+['md5_hash','filename','location'] + # our different list will have multiple entries for a workflow if there is a version on either side + # we want to grab the most recent one, so we sort and grab the most recent one for each workflow + changedfiles = different.sort_values('date_created',ascending=False).groupby('filename').first() + + # get an exclusive or list of workflow ids - that is we want lists of files that are + # on one machine or the other, but not both + remote_spec_ids = remote[['filename']] + local_spec_ids = local[['filename']] + left = remote_spec_ids[~remote_spec_ids['filename'].isin(local_spec_ids['filename'])] + right = local_spec_ids[~local_spec_ids['filename'].isin(remote_spec_ids['filename'])] + changedfiles['new'] = False + changedfiles.loc[changedfiles.index.isin(left['filename']), 'new'] = True + changedfiles.loc[changedfiles.index.isin(right['filename']),'new'] = True + changedfiles = changedfiles.replace({pd.np.nan: None}) + # return the list as a dict, let swagger convert it to json + if as_df: + return changedfiles + else: + return changedfiles.reset_index().to_dict(orient='records') + + + +def get_all_spec_state(): + """ + Return a list of all workflow specs along with last updated date and a + thumbprint of all of the files that are used for that workflow_spec + Convert into a dict list from a dataframe + """ + df = get_all_spec_state_dataframe() + return df.reset_index().to_dict(orient='records') + + +def get_workflow_spec_files(workflow_spec_id): + """ + Return a list of all workflow specs along with last updated date and a + thumbprint of all of the files that are used for that workflow_spec + Convert into a dict list from a dataframe + """ + df = get_workflow_spec_files_dataframe(workflow_spec_id) + return df.reset_index().to_dict(orient='records') + + +def get_workflow_spec_files_dataframe(workflowid): + """ + Return a list of all files for a workflow_spec along with last updated date and a + hash so we can determine file differences for a changed workflow on a box. 
+ Return a dataframe + """ + x = session.query(FileDataModel).join(FileModel).filter(FileModel.workflow_spec_id==workflowid) + # there might be a cleaner way of getting a data frome from some of the + # fields in the ORM - but this works OK + filelist = [] + for file in x: + filelist.append({'file_model_id':file.file_model_id, + 'workflow_spec_id': file.file_model.workflow_spec_id, + 'md5_hash':file.md5_hash, + 'filename':file.file_model.name, + 'type':file.file_model.type.name, + 'primary':file.file_model.primary, + 'content_type':file.file_model.content_type, + 'primary_process_id':file.file_model.primary_process_id, + 'date_created':file.date_created}) + if len(filelist) == 0: + return pd.DataFrame(columns=['file_model_id', + 'workflow_spec_id', + 'md5_hash', + 'filename', + 'type', + 'primary', + 'content_type', + 'primary_process_id', + 'date_created']) + df = pd.DataFrame(filelist).sort_values('date_created').groupby('file_model_id').last() + df['date_created'] = df['date_created'].astype('str') + return df + + + +def get_all_spec_state_dataframe(): + """ + Return a list of all workflow specs along with last updated date and a + thumbprint of all of the files that are used for that workflow_spec + Return a dataframe + """ + x = session.query(FileDataModel).join(FileModel) + # there might be a cleaner way of getting a data frome from some of the + # fields in the ORM - but this works OK + filelist = [] + for file in x: + filelist.append({'file_model_id':file.file_model_id, + 'workflow_spec_id': file.file_model.workflow_spec_id, + 'md5_hash':file.md5_hash, + 'filename':file.file_model.name, + 'date_created':file.date_created}) + df = pd.DataFrame(filelist) + + # get a distinct list of file_model_id's with the most recent file_data retained + df = df.sort_values('date_created').drop_duplicates(['file_model_id'],keep='last').copy() + + # take that list and then group by workflow_spec and retain the most recently touched file + # and make a consolidated hash of the md5_checksums - this acts as a 'thumbprint' for each + # workflow spec + df = df.groupby('workflow_spec_id').agg({'date_created':'max', + 'md5_hash':join_uuids}).copy() + # get only the columns we are really interested in returning + df = df[['date_created','md5_hash']].copy() + # convert dates to string + df['date_created'] = df['date_created'].astype('str') + return df + From 993c7bc76e6480d4d9e0ef6fa3ea41f2db4fafe1 Mon Sep 17 00:00:00 2001 From: Kelly McDonald Date: Fri, 11 Dec 2020 08:29:37 -0500 Subject: [PATCH 16/22] fixed error on api.yml from search / replace --- crc/api.yml | 14 +++---- deploy/requirements.txt | 93 +++++++++++++++++++---------------------- 2 files changed, 51 insertions(+), 56 deletions(-) diff --git a/crc/api.yml b/crc/api.yml index 41e76f90..ce02307f 100644 --- a/crc/api.yml +++ b/crc/api.yml @@ -547,7 +547,7 @@ paths: description: The workflow spec category has been removed. 
/file: parameters: - - name: workflow_sync_id + - name: workflow_spec_id in: query required: false description: The unique id of a workflow specification @@ -1380,7 +1380,7 @@ components: type: string WorkflowSpecDiffList: properties: - workflow_sync_id: + workflow_spec_id: type: string example : top_level_workflow date_created : @@ -1397,7 +1397,7 @@ components: file_model_id: type : integer example : 171 - workflow_sync_id : + workflow_spec_id : type: string example : top_level_workflow filename : @@ -1455,7 +1455,7 @@ components: WorkflowSpecAll: properties: - workflow_sync_id : + workflow_spec_id : type: string example : acaf1258-43b4-437e-8846-f612afa66811 date_created : @@ -1587,7 +1587,7 @@ components: content_type: type: string example: "application/xml" - workflow_sync_id: + workflow_spec_id: type: string example: "random_fact" x-nullable: true @@ -1609,7 +1609,7 @@ components: $ref: "#/components/schemas/NavigationItem" next_task: $ref: "#/components/schemas/Task" - workflow_sync_id: + workflow_spec_id: type: string spec_version: type: string @@ -1625,7 +1625,7 @@ components: example: id: 291234 status: 'user_input_required' - workflow_sync_id: 'random_fact' + workflow_spec_id: 'random_fact' spec_version: 'v1.1 [22,23]' is_latest_spec: True next_task: diff --git a/deploy/requirements.txt b/deploy/requirements.txt index af827f27..b9ab7b4c 100644 --- a/deploy/requirements.txt +++ b/deploy/requirements.txt @@ -1,81 +1,76 @@ alabaster==0.7.12 -alembic==1.4.2 -amqp==2.5.2 +alembic==1.4.3 aniso8601==8.0.0 -attrs==19.3.0 -babel==2.8.0 -bcrypt==3.1.7 -beautifulsoup4==4.9.1 -billiard==3.6.3.0 +attrs==20.3.0 +babel==2.9.0 +bcrypt==3.2.0 +beautifulsoup4==4.9.3 blinker==1.4 -celery==4.4.2 -certifi==2020.4.5.1 -cffi==1.14.0 +certifi==2020.11.8 +cffi==1.14.4 chardet==3.0.4 click==7.1.2 -clickclick==1.2.2 +clickclick==20.10.2 commonmark==0.9.1 -configparser==5.0.0 connexion==2.7.0 -coverage==5.1 +coverage==5.3 +deprecated==1.2.10 docutils==0.16 -docxtpl==0.9.2 +docxtpl==0.11.2 et-xmlfile==1.0.1 flask==1.1.2 flask-admin==1.5.7 flask-bcrypt==0.7.1 -flask-cors==3.0.8 +flask-cors==3.0.9 flask-mail==0.9.1 -flask-marshmallow==0.12.0 +flask-marshmallow==0.14.0 flask-migrate==2.5.3 flask-restful==0.3.8 -flask-sqlalchemy==2.4.1 -flask-sso==0.4.0 -future==0.18.2 -httpretty==1.0.2 -idna==2.9 +flask-sqlalchemy==2.4.4 +gunicorn==20.0.4 +httpretty==1.0.3 +idna==2.10 imagesize==1.2.0 -importlib-metadata==1.6.0 -inflection==0.4.0 +inflection==0.5.1 itsdangerous==1.1.0 jdcal==1.4.1 jinja2==2.11.2 jsonschema==3.2.0 -kombu==4.6.8 -ldap3==2.7 -lxml==4.5.1 -mako==1.1.2 +ldap3==2.8.1 +lxml==4.6.2 +mako==1.1.3 +markdown==3.3.3 markupsafe==1.1.1 -marshmallow==3.6.0 +marshmallow==3.9.1 marshmallow-enum==1.5.1 -marshmallow-sqlalchemy==0.23.0 -numpy==1.18.4 -openapi-spec-validator==0.2.8 -openpyxl==3.0.3 +marshmallow-sqlalchemy==0.24.1 +numpy==1.19.4 +openapi-spec-validator==0.2.9 +openpyxl==3.0.5 packaging==20.4 -pandas==1.0.3 -psycopg2-binary==2.8.5 +pandas==1.1.4 +psycopg2-binary==2.8.6 pyasn1==0.4.8 pycparser==2.20 -PyGithub==1.53 -pygments==2.6.1 +pygithub==1.53 +pygments==2.7.2 pyjwt==1.7.1 pyparsing==2.4.7 -pyrsistent==0.16.0 +pyrsistent==0.17.3 python-box==5.2.0 python-dateutil==2.8.1 python-docx==0.8.10 python-editor==1.0.4 -python-Levenshtein==0.12.0 -pytz==2020.1 +python-levenshtein==0.12.0 +pytz==2020.4 pyyaml==5.3.1 recommonmark==0.6.0 -requests==2.23.0 +requests==2.25.0 sentry-sdk==0.14.4 -six==1.14.0 +six==1.15.0 snowballstemmer==2.0.0 soupsieve==2.0.1 -sphinx==3.0.3 +sphinx==3.3.1 
sphinxcontrib-applehelp==1.0.2
 sphinxcontrib-devhelp==1.0.2
 sphinxcontrib-htmlhelp==1.0.3
@@ -83,14 +78,14 @@
 sphinxcontrib-jsmath==1.0.1
 sphinxcontrib-qthelp==1.0.3
 sphinxcontrib-serializinghtml==1.1.4
 spiffworkflow
-sqlalchemy==1.3.17
-swagger-ui-bundle==0.0.6
-urllib3==1.25.9
-vine==1.3.0
-waitress==1.4.3
+sqlalchemy==1.3.20
+swagger-ui-bundle==0.0.8
+urllib3==1.26.2
+waitress==1.4.4
 webob==1.8.6
 webtest==2.0.35
 werkzeug==1.0.1
+wrapt==1.12.1
+wtforms==2.3.3
 xlrd==1.2.0
-xlsxwriter==1.2.8
-zipp==3.1.0
+xlsxwriter==1.3.7

From 9eea26e019de87a96a09baa66f27e3a9dd3d4472 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Fri, 11 Dec 2020 08:34:59 -0500
Subject: [PATCH 17/22] add workflow_sync test

---
 crc/api/workflow_sync.py    |  8 ++++----
 tests/base_test.py          |  8 ++++++++
 tests/test_workflow_sync.py | 18 ++++++++++++++++++
 3 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_workflow_sync.py

diff --git a/crc/api/workflow_sync.py b/crc/api/workflow_sync.py
index 98f04236..c4256a68 100644
--- a/crc/api/workflow_sync.py
+++ b/crc/api/workflow_sync.py
@@ -31,7 +31,7 @@ def get_changed_workflows(remote,as_df=False):
         response = requests.get('http://'+remote+'/v1.0/workflow_sync/all',headers={'X-CR-API-KEY':app.config['API_TOKEN']})
     except:
         raise ApiError("endpoint error", 'had a problem connecting to '+remote)
-    if response.status_code != 200:
+    if not response.ok:
         raise ApiError("endpoint error", response.text)
 
     remote = pd.DataFrame(json.loads(response.text))
@@ -102,7 +102,7 @@ def sync_changed_files(remote,workflow_spec_id):
     except:
         raise ApiError("endpoint error", 'had a problem connecting to '+remote)
 
-    if remotespectext.status_code != 200:
+    if not remotespectext.ok:
         raise ApiError("endpoint error", response.text)
 
     specdict = json.loads(remotespectext.text)
@@ -170,7 +170,7 @@ def sync_changed_files(remote,workflow_spec_id):
     except:
         raise ApiError("endpoint error", 'had a problem connecting to ' + remote)
 
-    if response.status_code != 200:
+    if not response.ok:
         raise ApiError("endpoint error", response.text)
 
     FileService.update_file(currentfile,response.content,updatefile['type'])
@@ -190,7 +190,7 @@ def get_changed_files(remote,workflow_spec_id,as_df=False):
     except:
         raise ApiError("endpoint error", 'had a problem connecting to '+remote)
 
-    if response.status_code != 200:
+    if not response.ok:
         raise ApiError("endpoint error", response.text)
 
     # This is probably very and may allow cross site attacks - fix later
diff --git a/tests/base_test.py b/tests/base_test.py
index af899917..e32bbd9b 100644
--- a/tests/base_test.py
+++ b/tests/base_test.py
@@ -204,6 +204,14 @@ class BaseTest(unittest.TestCase):
             data = myfile.read()
         return data
 
+    @staticmethod
+    def workflow_sync_response(file_name):
+        filepath = os.path.join(app.root_path, '..', 'tests', 'data', 'workflow_sync_responses', file_name)
+        with open(filepath, 'r') as myfile:
+            data = myfile.read()
+        return data
+
+
     def assert_success(self, rv, msg=""):
         try:
             data = json.loads(rv.get_data(as_text=True))
diff --git a/tests/test_workflow_sync.py b/tests/test_workflow_sync.py
new file mode 100644
index 00000000..9a739c48
--- /dev/null
+++ b/tests/test_workflow_sync.py
@@ -0,0 +1,18 @@
+from unittest.mock import patch
+
+from crc import app
+from tests.base_test import BaseTest
+from crc.api.workflow_sync import get_all_spec_state, get_changed_workflows
+import json
+
+class TestWorkflowSync(BaseTest):
+
+    @patch('crc.api.workflow_sync.requests.get')
+    def test_get_no_changes(self, mock_get):
+        othersys = 
get_all_spec_state() + mock_get.return_value.ok = True + mock_get.return_value.text = json.dumps(othersys) + response = get_changed_workflows('somewhere over the rainbow') + self.assertIsNotNone(response) + self.assertEqual(response,[]) + From 3a1160efac2348748b083d1b9dd4f83c11aee2b0 Mon Sep 17 00:00:00 2001 From: Kelly McDonald Date: Fri, 11 Dec 2020 11:41:32 -0500 Subject: [PATCH 18/22] refactored calls into a service --- crc/api.yml | 3 +- crc/api/workflow_sync.py | 51 ++++++-------------------- tests/study/test_study_api.py | 2 +- tests/test_workflow_sync.py | 68 +++++++++++++++++++++++++++++++---- 4 files changed, 76 insertions(+), 48 deletions(-) diff --git a/crc/api.yml b/crc/api.yml index ce02307f..0939daa5 100644 --- a/crc/api.yml +++ b/crc/api.yml @@ -657,7 +657,8 @@ paths: get: operationId: crc.api.file.get_file_data_by_hash summary: Returns only the file contents - security: [] # Disable security for this endpoint only. + security: + - ApiKeyAuth: [] tags: - Files responses: diff --git a/crc/api/workflow_sync.py b/crc/api/workflow_sync.py index c4256a68..6a64fe7d 100644 --- a/crc/api/workflow_sync.py +++ b/crc/api/workflow_sync.py @@ -7,6 +7,7 @@ from crc.api.common import ApiError from crc.models.file import FileModel, FileDataModel from crc.models.workflow import WorkflowSpecModel, WorkflowSpecCategoryModel from crc.services.file_service import FileService +from crc.services.workflow_sync import WorkflowSyncService def join_uuids(uuids): @@ -27,21 +28,16 @@ def get_changed_workflows(remote,as_df=False): gets a remote endpoint - gets the workflows and then determines what workflows are different from the remote endpoint """ - try: - response = requests.get('http://'+remote+'/v1.0/workflow_sync/all',headers={'X-CR-API-KEY':app.config['API_TOKEN']}) - except: - raise ApiError("endpoint error", 'had a problem connecting to '+remote) - if not response.ok: - raise ApiError("endpoint error", response.text) - remote = pd.DataFrame(json.loads(response.text)) + remote_workflows_list = WorkflowSyncService.get_all_remote_workflows(remote) + remote_workflows = pd.DataFrame(remote_workflows_list) # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index local = get_all_spec_state_dataframe().reset_index() # merge these on workflow spec id and hash - this will # make two different date columns date_x and date_y - different = remote.merge(local, + different = remote_workflows.merge(local, right_on=['workflow_spec_id','md5_hash'], left_on=['workflow_spec_id','md5_hash'], how = 'outer' , @@ -66,7 +62,7 @@ def get_changed_workflows(remote,as_df=False): # get an exclusive or list of workflow ids - that is we want lists of files that are # on one machine or the other, but not both - remote_spec_ids = remote[['workflow_spec_id']] + remote_spec_ids = remote_workflows[['workflow_spec_id']] local_spec_ids = local[['workflow_spec_id']] left = remote_spec_ids[~remote_spec_ids['workflow_spec_id'].isin(local_spec_ids['workflow_spec_id'])] right = local_spec_ids[~local_spec_ids['workflow_spec_id'].isin(remote_spec_ids['workflow_spec_id'])] @@ -96,16 +92,8 @@ def sync_all_changed_workflows(remote): def sync_changed_files(remote,workflow_spec_id): # make sure that spec is local before syncing files - try: - remotespectext = requests.get('http://'+remote+'/v1.0/workflow-specification/'+workflow_spec_id, - headers={'X-CR-API-KEY': app.config['API_TOKEN']}) - except: - raise ApiError("endpoint error", 'had a problem connecting to '+remote) - if not remotespectext.ok: - raise 
ApiError("endpoint error", response.text) - - specdict = json.loads(remotespectext.text) + specdict = WorkflowSyncService.get_remote_workfow_spec(remote,workflow_spec_id) localspec = session.query(WorkflowSpecModel).filter(WorkflowSpecModel.id == workflow_spec_id).first() if localspec is None: @@ -164,16 +152,8 @@ def sync_changed_files(remote,workflow_spec_id): currentfile.content_type = updatefile['content_type'] currentfile.primary_process_id = updatefile['primary_process_id'] session.add(currentfile) - try: - response = requests.get('http://'+remote+'/v1.0/file/'+updatefile['md5_hash']+'/hash_data', - headers={'X-CR-API-KEY': app.config['API_TOKEN']}) - except: - raise ApiError("endpoint error", 'had a problem connecting to ' + remote) - - if not response.ok: - raise ApiError("endpoint error", response.text) - - FileService.update_file(currentfile,response.content,updatefile['type']) + content = WorkflowSyncService.get_remote_file_by_hash(remote,updatefile['md5_hash']) + FileService.update_file(currentfile,content,updatefile['type']) session.commit() return [x['filename'] for x in updatefiles] @@ -184,21 +164,12 @@ def get_changed_files(remote,workflow_spec_id,as_df=False): local and remote and determines what files have been change and returns a list of those files """ - try: - response = requests.get('http://'+remote+'/v1.0/workflow_sync/'+workflow_spec_id+'/files', - headers={'X-CR-API-KEY':app.config['API_TOKEN']}) - except: - raise ApiError("endpoint error", 'had a problem connecting to '+remote) - - if not response.ok: - raise ApiError("endpoint error", response.text) - - # This is probably very and may allow cross site attacks - fix later - remote = pd.DataFrame(json.loads(response.text)) + remote_file_list = WorkflowSyncService.get_remote_workflow_spec_files(remote,workflow_spec_id) + remote_files = pd.DataFrame(remote_file_list) # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index local = get_workflow_spec_files_dataframe(workflow_spec_id).reset_index() local['md5_hash'] = local['md5_hash'].astype('str') - different = remote.merge(local, + different = remote_files.merge(local, right_on=['filename','md5_hash'], left_on=['filename','md5_hash'], how = 'outer' , diff --git a/tests/study/test_study_api.py b/tests/study/test_study_api.py index e7d33a9d..df734086 100644 --- a/tests/study/test_study_api.py +++ b/tests/study/test_study_api.py @@ -165,7 +165,7 @@ class TestStudyApi(BaseTest): self.assertEqual(study_event.comment, update_comment) self.assertEqual(study_event.user_uid, self.test_uid) - @patch('crc.services.protocol_builder.ProtocolBuilderService.get_investigators') # mock_studies + @patch('crc.services.protocol_builder.ProtocolBuilderService.get_investigators') # mock_investigators @patch('crc.services.protocol_builder.ProtocolBuilderService.get_required_docs') # mock_docs @patch('crc.services.protocol_builder.ProtocolBuilderService.get_study_details') # mock_details @patch('crc.services.protocol_builder.ProtocolBuilderService.get_studies') # mock_studies diff --git a/tests/test_workflow_sync.py b/tests/test_workflow_sync.py index 9a739c48..f2c68719 100644 --- a/tests/test_workflow_sync.py +++ b/tests/test_workflow_sync.py @@ -1,19 +1,75 @@ from unittest.mock import patch -from crc import app +from crc import db from tests.base_test import BaseTest from crc.api.workflow_sync import get_all_spec_state, get_changed_workflows +from crc.models.workflow import WorkflowSpecModel import json -pip +from datetime import datetime +from 
crc.services.file_service import FileService class TestWorkflowSync(BaseTest): - @patch('crc.api.workflow_sync.requests.get') + @patch('crc.services.workflow_sync.WorkflowSyncService.get_all_remote_workflows') def test_get_no_changes(self, mock_get): + self.load_example_data() othersys = get_all_spec_state() - mock_get.return_value.ok = True - mock_get.return_value.text = json.dumps(othersys) - response = get_changed_workflows('somewhere over the rainbow') + mock_get.return_value = othersys + response = get_changed_workflows('localhost:0000') # not actually used due to mock self.assertIsNotNone(response) self.assertEqual(response,[]) + + @patch('crc.services.workflow_sync.WorkflowSyncService.get_all_remote_workflows') + def test_remote_workflow_change(self, mock_get): + self.load_example_data() + othersys = get_all_spec_state() + othersys[1]['date_created'] = str(datetime.now()) + othersys[1]['md5_hash'] = '12345' + mock_get.return_value = othersys + response = get_changed_workflows('localhost:0000') #endpoint is not used due to mock + self.assertIsNotNone(response) + self.assertEqual(len(response),1) + self.assertEqual(response[0]['workflow_spec_id'], 'random_fact') + self.assertEqual(response[0]['location'], 'remote') + self.assertEqual(response[0]['new'], False) + + + + @patch('crc.services.workflow_sync.WorkflowSyncService.get_all_remote_workflows') + def test_remote_workflow_has_new(self, mock_get): + self.load_example_data() + othersys = get_all_spec_state() + othersys.append({'workflow_spec_id':'my_new_workflow', + 'date_created':str(datetime.now()), + 'md5_hash': '12345'}) + mock_get.return_value = othersys + response = get_changed_workflows('localhost:0000') #endpoint is not used due to mock + self.assertIsNotNone(response) + self.assertEqual(len(response),1) + self.assertEqual(response[0]['workflow_spec_id'],'my_new_workflow') + self.assertEqual(response[0]['location'], 'remote') + self.assertEqual(response[0]['new'], True) + + + @patch('crc.services.workflow_sync.WorkflowSyncService.get_all_remote_workflows') + def test_local_workflow_has_new(self, mock_get): + self.load_example_data() + + othersys = get_all_spec_state() + mock_get.return_value = othersys + wf_spec = WorkflowSpecModel() + wf_spec.id = 'abcdefg' + wf_spec.display_name = 'New Workflow - Yum!!' 
+        wf_spec.name = 'my_new_workflow'
+        wf_spec.description = 'yep - its a new workflow'
+        wf_spec.category_id = 0
+        wf_spec.display_order = 0
+        db.session.add(wf_spec)
+        db.session.commit()
+        FileService.add_workflow_spec_file(wf_spec,'dummyfile.txt','text',b'this is a test')
+        # note: a spec that is new ONLY locally is not reported as a change here -
+        # get_changed_workflows() reports remote-side differences, so this is empty
+        response = get_changed_workflows('localhost:0000') #endpoint is not used due to mock
+        self.assertIsNotNone(response)
+        self.assertEqual(response,[])

From 55e6f5b753f7ad12f4d7e719161824c206af4d79 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Fri, 11 Dec 2020 11:42:00 -0500
Subject: [PATCH 19/22] refactored calls into a service - forgot to add the
 actual service file

---
 crc/services/workflow_sync.py | 57 +++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 crc/services/workflow_sync.py

diff --git a/crc/services/workflow_sync.py b/crc/services/workflow_sync.py
new file mode 100644
index 00000000..e2026746
--- /dev/null
+++ b/crc/services/workflow_sync.py
@@ -0,0 +1,57 @@
+import json
+from json import JSONDecodeError
+from typing import List, Optional
+
+import requests
+
+from crc import app
+from crc.api.common import ApiError
+
+
+class WorkflowSyncService(object):
+
+    @staticmethod
+    def get_remote_file_by_hash(remote,md5_hash):
+        url = remote+'/v1.0/file/'+md5_hash+'/hash_data'
+        return WorkflowSyncService.__make_request(url,return_contents=True)
+
+    @staticmethod
+    def get_remote_workflow_spec_files(remote,workflow_spec_id):
+        url = remote+'/v1.0/workflow_sync/'+workflow_spec_id+'/files'
+        return WorkflowSyncService.__make_request(url)
+
+    @staticmethod
+    def get_remote_workflow_spec(remote, workflow_spec_id):
+        """
+        this just gets the details of a workflow spec from the
+        remote side.
+
+        FIXME: for testing I had changed the security on the API endpoint
+        below so that I could run it - I need to restore the security on this
+        and make a new workflow_sync endpoint that just calls this same function
+        so that I can use the API_TOKEN rather than the other token setup
+        """
+        url = remote+'/v1.0/workflow-specification/'+workflow_spec_id
+        return WorkflowSyncService.__make_request(url)
+
+    @staticmethod
+    def get_all_remote_workflows(remote):
+        url = remote + '/v1.0/workflow_sync/all'
+        return WorkflowSyncService.__make_request(url)
+
+    @staticmethod
+    def __make_request(url,return_contents=False):
+        try:
+            response = requests.get(url,headers={'X-CR-API-KEY':app.config['API_TOKEN']})
+        except requests.exceptions.RequestException:
+            raise ApiError("workflow_sync_error", 'had a problem connecting to '+url)
+        if response.ok and response.text:
+            if return_contents:
+                return response.content
+            else:
+                return json.loads(response.text)
+        else:
+            raise ApiError("workflow_sync_error",
+                           "Received an invalid response from the remote CR Connect instance (status %s): %s "
+                           "when calling url '%s'." %
+                           (response.status_code, response.text, url))

From adc4dc4453392cbae7665a6e0e0b36fe90eb3b5a Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Fri, 11 Dec 2020 12:03:41 -0500
Subject: [PATCH 20/22] redid the api so that nothing is using open security -
 added a new endpoint for getting a workflow spec that uses the alternate
 API_TOKEN security, and left the original endpoint as it was.
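
The new /v1.0/workflow_sync/{workflow_spec_id}/spec endpoint returns the same
WorkflowSpec payload as /v1.0/workflow-specification/{workflow_spec_id}, but it
is guarded by the X-CR-API-KEY header rather than the user token. A rough
sketch of how a remote instance can now fetch a spec - the host, spec id and
token below are illustrative placeholders, not part of this change:

    import requests

    # The X-CR-API-KEY header must match the API_TOKEN configured on the
    # machine being queried; 'localhost:5000' and the token value here are
    # hypothetical examples.
    response = requests.get('http://localhost:5000/v1.0/workflow_sync/random_fact/spec',
                            headers={'X-CR-API-KEY': 'af95596f327c9ecc007b60414fc84b61'})
    response.raise_for_status()
    spec = response.json()  # same payload as /workflow-specification/random_fact
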
---
 crc/api.yml                   | 24 +++++++++++++++++++++++-
 crc/api/workflow_sync.py      |  4 ++++
 crc/services/workflow_sync.py |  7 +------
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/crc/api.yml b/crc/api.yml
index 0939daa5..a4b799ea 100644
--- a/crc/api.yml
+++ b/crc/api.yml
@@ -156,6 +156,29 @@ paths:
           items:
             $ref: "#/components/schemas/WorkflowSpecDiffList"
 
+  /workflow_sync/{workflow_spec_id}/spec:
+    parameters:
+      - name: workflow_spec_id
+        in: path
+        required: true
+        description: The unique id of the workflow specification to retrieve.
+        schema:
+          type: string
+    get:
+      operationId: crc.api.workflow_sync.get_sync_workflow_specification
+      summary: Returns a single workflow specification
+      security:
+        - ApiKeyAuth: []
+      tags:
+        - Workflow Sync API
+      responses:
+        '200':
+          description: Workflow specification.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/WorkflowSpec"
+
 
   /workflow_sync/{workflow_spec_id}/files:
     get:
@@ -401,7 +424,6 @@ paths:
     get:
       operationId: crc.api.workflow.get_workflow_specification
       summary: Returns a single workflow specification
-      security: []
       tags:
         - Workflow Specifications
       responses:
diff --git a/crc/api/workflow_sync.py b/crc/api/workflow_sync.py
index 6a64fe7d..4915af0f 100644
--- a/crc/api/workflow_sync.py
+++ b/crc/api/workflow_sync.py
@@ -8,8 +8,12 @@ from crc.models.file import FileModel, FileDataModel
 from crc.models.workflow import WorkflowSpecModel, WorkflowSpecCategoryModel
 from crc.services.file_service import FileService
 from crc.services.workflow_sync import WorkflowSyncService
+from crc.api.workflow import get_workflow_specification
 
 
+def get_sync_workflow_specification(workflow_spec_id):
+    return get_workflow_specification(workflow_spec_id)
+
 def join_uuids(uuids):
     """Joins a pandas Series of uuids and combines them in one hash"""
     combined_uuids = ''.join([str(uuid) for uuid in uuids.sort_values()])  # ensure that values are always
diff --git a/crc/services/workflow_sync.py b/crc/services/workflow_sync.py
index e2026746..26796413 100644
--- a/crc/services/workflow_sync.py
+++ b/crc/services/workflow_sync.py
@@ -25,13 +25,8 @@ class WorkflowSyncService(object):
         """
         this just gets the details of a workflow spec from the
         remote side.
-
-        FIXME: for testing I had changed the security on the API endpoint
-        below so that I could run it - I need to restore the security on this
-        and make a new workflow_sync endpoint that just calls this same function
-        so that I can use the API_TOKEN rather than the other token setup
         """
-        url = remote+'/v1.0/workflow-specification/'+workflow_spec_id
+        url = remote+'/v1.0/workflow_sync/'+workflow_spec_id+'/spec'
         return WorkflowSyncService.__make_request(url)
 
     @staticmethod

From ee3ee9fd4a7439b69285b6089d2d151aac4f8f52 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Mon, 14 Dec 2020 10:27:40 -0500
Subject: [PATCH 21/22] Added tests to cover most of the use cases and code,
 and a bunch of stuff to make the mocks happy

---
 crc/api/workflow_sync.py                      |   6 +-
 tests/base_test.py                            |   2 +-
 tests/data/random_fact/random_fact2.bpmn      | 200 ++++++++++++++++++
 .../workflow_sync_responses/random_fact2.bpmn | 104 +++++++++
 tests/test_workflow_sync.py                   |  70 +++++-
 5 files changed, 377 insertions(+), 5 deletions(-)
 create mode 100644 tests/data/random_fact/random_fact2.bpmn
 create mode 100644 tests/data/workflow_sync_responses/random_fact2.bpmn

diff --git a/crc/api/workflow_sync.py b/crc/api/workflow_sync.py
index 4915af0f..68cd9fd2 100644
--- a/crc/api/workflow_sync.py
+++ b/crc/api/workflow_sync.py
@@ -97,7 +97,7 @@ def sync_all_changed_workflows(remote):
 
 def sync_changed_files(remote,workflow_spec_id):
     # make sure that spec is local before syncing files
-    specdict = WorkflowSyncService.get_remote_workfow_spec(remote,workflow_spec_id)
+    specdict = WorkflowSyncService.get_remote_workflow_spec(remote,workflow_spec_id)
 
     localspec = session.query(WorkflowSpecModel).filter(WorkflowSpecModel.id == workflow_spec_id).first()
     if localspec is None:
@@ -173,6 +173,8 @@ def get_changed_files(remote,workflow_spec_id,as_df=False):
     # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index
     local = get_workflow_spec_files_dataframe(workflow_spec_id).reset_index()
     local['md5_hash'] = local['md5_hash'].astype('str')
+    remote_files['md5_hash'] = remote_files['md5_hash'].astype('str')
+
     different = remote_files.merge(local,
                       right_on=['filename','md5_hash'],
                       left_on=['filename','md5_hash'],
@@ -205,7 +207,7 @@ def get_changed_files(remote,workflow_spec_id,as_df=False):
 
     # get an exclusive or list of workflow ids - that is we want lists of files that are
     # on one machine or the other, but not both
-    remote_spec_ids = remote[['filename']]
+    remote_spec_ids = remote_files[['filename']]
     local_spec_ids = local[['filename']]
     left = remote_spec_ids[~remote_spec_ids['filename'].isin(local_spec_ids['filename'])]
     right = local_spec_ids[~local_spec_ids['filename'].isin(remote_spec_ids['filename'])]
diff --git a/tests/base_test.py b/tests/base_test.py
index e32bbd9b..4f2306a2 100644
--- a/tests/base_test.py
+++ b/tests/base_test.py
@@ -207,7 +207,7 @@ class BaseTest(unittest.TestCase):
     @staticmethod
     def workflow_sync_response(file_name):
         filepath = os.path.join(app.root_path, '..', 'tests', 'data', 'workflow_sync_responses', file_name)
-        with open(filepath, 'r') as myfile:
+        with open(filepath, 'rb') as myfile:
             data = myfile.read()
         return data
diff --git a/tests/data/random_fact/random_fact2.bpmn b/tests/data/random_fact/random_fact2.bpmn
new file mode 100644
index 00000000..bd0a9ef5
--- /dev/null
+++ b/tests/data/random_fact/random_fact2.bpmn
@@ -0,0 +1,200 @@
+[200 lines of BPMN 2.0 XML elided - the angle-bracket markup did not survive
+ this rendering of the patch. The file adds a second copy of the two-task
+ "random fact" workflow (a user task that sets Fact.type to cat, norris, or
+ buzzword, a script task that calls fact_service(), and a closing task); the
+ first task's documentation element carries a long markdown sample - headings,
+ horizontal rules, emphasis, blockquotes, nested lists, tables, links and
+ images - used to exercise markdown rendering.]
diff --git a/tests/data/workflow_sync_responses/random_fact2.bpmn b/tests/data/workflow_sync_responses/random_fact2.bpmn
new file mode 100644
index 00000000..f502a238
--- /dev/null
+++ b/tests/data/workflow_sync_responses/random_fact2.bpmn
@@ -0,0 +1,104 @@
+[104 lines of BPMN 2.0 XML elided - the angle-bracket markup did not survive
+ this rendering of the patch. This is the remote-side variant of
+ random_fact2.bpmn that the mocked sync response serves up via
+ workflow_sync_response(); it is the same workflow with a much shorter
+ documentation sample beginning "# h1 Heading 8-)" and carrying the marker
+ text NEW_FILE_ADDED.]
diff --git a/tests/test_workflow_sync.py b/tests/test_workflow_sync.py
index f2c68719..eb0cd1b5 100644
--- a/tests/test_workflow_sync.py
+++ b/tests/test_workflow_sync.py
@@ -2,12 +2,18 @@
 from unittest.mock import patch
 
 from crc import db
 from tests.base_test import BaseTest
-from crc.api.workflow_sync import get_all_spec_state, get_changed_workflows
+from crc.api.workflow_sync import get_all_spec_state, \
+    get_changed_workflows, \
+    get_workflow_spec_files, \
+    get_changed_files, \
+    get_workflow_specification, \
+    sync_changed_files
 from crc.models.workflow import WorkflowSpecModel
-import json
 from datetime import datetime
 from crc.services.file_service import FileService
+
+
 class TestWorkflowSync(BaseTest):
 
     @patch('crc.services.workflow_sync.WorkflowSyncService.get_all_remote_workflows')
@@ -73,3 +79,78 @@ class TestWorkflowSync(BaseTest):
         response = get_changed_workflows('localhost:0000') #endpoint is not used due to mock
         self.assertIsNotNone(response)
         self.assertEqual(response,[])
+
+
+    @patch('crc.services.workflow_sync.WorkflowSyncService.get_remote_workflow_spec_files')
+    def test_file_differences(self, mock_get):
+        self.load_example_data()
+        othersys = get_workflow_spec_files('random_fact')
+        othersys[1]['date_created'] = str(datetime.now())
+        othersys[1]['md5_hash'] = '12345'
+        mock_get.return_value = othersys
+        response = get_changed_files('localhost:0000','random_fact',as_df=False) #endpoint is not used due to mock
+        self.assertIsNotNone(response)
+        self.assertEqual(len(response),1)
+        self.assertEqual(response[0]['filename'], 'random_fact2.bpmn')
+        self.assertEqual(response[0]['location'], 'remote')
+        self.assertEqual(response[0]['new'], False)
+
+
+    @patch('crc.services.workflow_sync.WorkflowSyncService.get_remote_file_by_hash')
+    @patch('crc.services.workflow_sync.WorkflowSyncService.get_remote_workflow_spec_files')
+    @patch('crc.services.workflow_sync.WorkflowSyncService.get_remote_workflow_spec')
+    def test_file_sync(self, workflow_mock, spec_files_mock, file_data_mock):
+        self.load_example_data()
+        remote_workflow = get_workflow_specification('random_fact')
+        self.assertEqual(remote_workflow['display_name'],'Random Fact')
+        remote_workflow['description'] = 'This Workflow came from Remote'
+        remote_workflow['display_name'] = 'Remote Workflow'
+        workflow_mock.return_value = remote_workflow
+        othersys = get_workflow_spec_files('random_fact')
+        othersys[1]['date_created'] = str(datetime.now())
+        othersys[1]['md5_hash'] = '12345'
+        spec_files_mock.return_value = othersys
+        file_data_mock.return_value = self.workflow_sync_response('random_fact2.bpmn')
+        response = sync_changed_files('localhost:0000','random_fact') # endpoint not used due to mock
+        self.assertIsNotNone(response)
+        self.assertEqual(len(response),1)
+        self.assertEqual(response[0], 'random_fact2.bpmn')
+        files = FileService.get_spec_data_files('random_fact')
+        md5sums = [str(f.md5_hash) for f in files]
+        self.assertIn('21bb6f9e-0af7-0ab2-0fc7-ec0f94787e58', md5sums)
+        new_local_workflow = get_workflow_specification('random_fact')
+        self.assertEqual(new_local_workflow['display_name'],'Remote Workflow')
+
+
+    @patch('crc.services.workflow_sync.WorkflowSyncService.get_remote_workflow_spec_files')
+    @patch('crc.services.workflow_sync.WorkflowSyncService.get_remote_workflow_spec')
+    def test_file_deleted(self, workflow_mock, spec_files_mock):
+        self.load_example_data()
+        remote_workflow = get_workflow_specification('random_fact')
+        workflow_mock.return_value = remote_workflow
+        othersys = get_workflow_spec_files('random_fact')
+        del(othersys[1])
+        spec_files_mock.return_value = othersys
+        response = sync_changed_files('localhost:0000','random_fact') # endpoint not used due to mock
+        self.assertIsNotNone(response)
+        # when we delete a local file, we do not return that it was deleted - just
+        # a list of updated files. We may want to change this in the future.
+        self.assertEqual(len(response),0)
+        files = FileService.get_spec_data_files('random_fact')
+        self.assertEqual(len(files),1)

From d8ac20b1c33da1bec8e01a8f2ba455b3d3243835 Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Mon, 14 Dec 2020 10:37:16 -0500
Subject: [PATCH 22/22] I added a second file to the 'random_fact' test
 workflow, so a files API test that used to see 2 files now sees 3 - nothing
 to see here, move along

---
 tests/files/test_files_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/files/test_files_api.py b/tests/files/test_files_api.py
index 02feb8d0..958989d1 100644
--- a/tests/files/test_files_api.py
+++ b/tests/files/test_files_api.py
@@ -46,7 +46,7 @@ class TestFilesApi(BaseTest):
                            content_type="application/json", headers=self.logged_in_headers())
         self.assert_success(rv)
         json_data = json.loads(rv.get_data(as_text=True))
-        self.assertEqual(2, len(json_data))
+        self.assertEqual(3, len(json_data))
 
 
     def test_create_file(self):
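A note on the stacked `@patch` decorators in the tests above: decorators apply bottom-up, so the decorator *closest* to the function supplies the *first* mock parameter after `self`, and the top decorator supplies the last. Getting that backwards produces confusing failures where the wrong call is mocked. A self-contained sketch of the ordering - the class and method names here are invented for illustration, not part of the codebase:

```python
from unittest import TestCase
from unittest.mock import patch


class RemoteService:
    """Stand-in for a service with two remote calls (hypothetical)."""
    @staticmethod
    def get_spec():
        raise RuntimeError('would hit the network')

    @staticmethod
    def get_files():
        raise RuntimeError('would hit the network')


class DecoratorOrderExample(TestCase):
    # Decorators apply bottom-up: the decorator nearest the function
    # produces the first mock argument after self.
    @patch.object(RemoteService, 'get_files')   # outermost -> files_mock (2nd arg)
    @patch.object(RemoteService, 'get_spec')    # innermost -> spec_mock (1st arg)
    def test_order(self, spec_mock, files_mock):
        spec_mock.return_value = {'id': 'random_fact'}
        files_mock.return_value = []
        self.assertEqual(RemoteService.get_spec(), {'id': 'random_fact'})
        self.assertEqual(RemoteService.get_files(), [])
```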