From d41d018fe34fa91569c5c64225c69622514e0201 Mon Sep 17 00:00:00 2001 From: Kelly McDonald Date: Fri, 4 Dec 2020 11:49:07 -0500 Subject: [PATCH] For a given workflow - find the files that are different from a remote endpoint for the same workflow --- crc/api.yml | 61 +++++++++++++++++++++++++++++++++- crc/api/workflow.py | 81 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 140 insertions(+), 2 deletions(-) diff --git a/crc/api.yml b/crc/api.yml index 959d6061..afeb2bc2 100644 --- a/crc/api.yml +++ b/crc/api.yml @@ -126,10 +126,69 @@ paths: $ref: "#/components/schemas/Study" + /workflow_spec/{workflow_spec_id}/files: + get: + operationId: crc.api.workflow.get_workflow_spec_files + summary: Provides a list of workflow specs and their signature + security: [] # Disable security for this endpoint only - we'll sanity check + # in the endpoint + parameters: + - name: workflow_spec_id + in: path + required: true + description: The workflow_spec id + schema: + type: string + + tags: + - Workflow Spec States + responses: + '200': + description: An array of workflow specs, with last touched date and file signature. + content: + application/json: + schema: + type: array + items: + $ref: "#/components/schemas/Study" + + /workflow_spec/{workflow_spec_id}/files/diff: + get: + operationId: crc.api.workflow.get_changed_files + summary: Provides a list of workflow specs and their signature + security: [] # Disable security for this endpoint only - we'll sanity check + # in the endpoint + parameters: + - name: workflow_spec_id + in: path + required: true + description: The workflow_spec id + schema: + type: string + - name: remote + in: query + required: true + description: The remote endpoint + schema: + type: string + + tags: + - Workflow Spec States + responses: + '200': + description: An array of workflow specs, with last touched date and file signature. 
+ content: + application/json: + schema: + type: array + items: + $ref: "#/components/schemas/Study" + + /workflow_spec/all: get: operationId: crc.api.workflow.get_all_spec_state - summary: Provides a list of workflow specs and their signature + summary: Provides a list of files for a workflow spec security: [] # Disable security for this endpoint only - we'll sanity check # in the endpoint tags: diff --git a/crc/api/workflow.py b/crc/api/workflow.py index 726aa78e..9484d183 100644 --- a/crc/api/workflow.py +++ b/crc/api/workflow.py @@ -320,6 +320,53 @@ def get_changed_workflows(remote): return output.reset_index().to_dict(orient='records') +def get_changed_files(remote,workflow_spec_id): + """ + gets a remote endpoint - gets the files for a workflow_spec on both + local and remote and determines what files have been change and returns a list of those + files + """ + response = requests.get('http://'+remote+'/v1.0/workflow_spec/'+workflow_spec_id+'/files') + # This is probably very and may allow cross site attacks - fix later + remote = pd.DataFrame(json.loads(response.text)) + # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index + local = get_workflow_spec_files_dataframe(workflow_spec_id).reset_index() + local['md5_hash'] = local['md5_hash'].astype('str') + different = remote.merge(local, + right_on=['filename','md5_hash'], + left_on=['filename','md5_hash'], + how = 'outer' , + indicator=True).loc[lambda x : x['_merge']!='both'] + + # each line has a tag on it - if was in the left or the right, + # label it so we know if that was on the remote or local machine + different.loc[different['_merge']=='left_only','location'] = 'remote' + different.loc[different['_merge']=='right_only','location'] = 'local' + + # this takes the different date_created_x and date-created_y columns and + # combines them back into one date_created column + index = different['date_created_x'].isnull() + different.loc[index,'date_created_x'] = 
different[index]['date_created_y'] + different = different[['date_created_x','filename','location']].copy() + + different.columns=['date_created','filename','location'] + # our different list will have multiple entries for a file if there is a version on either side + # we want to grab the most recent one, so we sort and grab the most recent one for each file + changedfiles = different.sort_values('date_created',ascending=False).groupby('filename').first() + + # get an exclusive-or list of filenames - that is, we want lists of files that are + # on one machine or the other, but not both + remote_spec_ids = remote[['filename']] + local_spec_ids = local[['filename']] + left = remote_spec_ids[~remote_spec_ids['filename'].isin(local_spec_ids['filename'])] + right = local_spec_ids[~local_spec_ids['filename'].isin(remote_spec_ids['filename'])] + changedfiles['new'] = False + changedfiles.loc[changedfiles.index.isin(left['filename']), 'new'] = True + changedfiles.loc[changedfiles.index.isin(right['filename']),'new'] = True + + # return the records as a list of dicts, let swagger convert it to json + return changedfiles.reset_index().to_dict(orient='records') + def get_all_spec_state(): @@ -331,6 +378,39 @@ def get_all_spec_state(): df = get_all_spec_state_dataframe() return df.reset_index().to_dict(orient='records') + +def get_workflow_spec_files(workflow_spec_id): + """ + Return a list of all files for a workflow_spec along with last updated date and a + md5 hash for each file, so file differences can be detected + Convert into a dict list from a dataframe + """ + df = get_workflow_spec_files_dataframe(workflow_spec_id) + return df.reset_index().to_dict(orient='records') + + +def get_workflow_spec_files_dataframe(workflowid): + """ + Return a list of all files for a workflow_spec along with last updated date and a + hash so we can determine file differences for a changed workflow on a box. 
+ Return a dataframe + """ + x = session.query(FileDataModel).join(FileModel).filter(FileModel.workflow_spec_id==workflowid) + # there might be a cleaner way of getting a data frame from some of the + # fields in the ORM - but this works OK + filelist = [] + for file in x: + filelist.append({'file_model_id':file.file_model_id, + 'workflow_spec_id': file.file_model.workflow_spec_id, + 'md5_hash':file.md5_hash, + 'filename':file.file_model.name, + 'date_created':file.date_created}) + df = pd.DataFrame(filelist).sort_values('date_created').groupby('file_model_id').last() + df['date_created'] = df['date_created'].astype('str') + return df + + + def get_all_spec_state_dataframe(): """ Return a list of all workflow specs along with last updated date and a @@ -338,7 +418,6 @@ def get_all_spec_state_dataframe(): Return a dataframe """ x = session.query(FileDataModel).join(FileModel) - # there might be a cleaner way of getting a data frome from some of the # fields in the ORM - but this works OK filelist = []