For a given workflow - find the files that are different from a remote endpoint for the same workflow

This commit is contained in:
Kelly McDonald 2020-12-04 11:49:07 -05:00
parent 3e8d4ca7c9
commit d41d018fe3
2 changed files with 140 additions and 2 deletions

View File

@ -126,10 +126,69 @@ paths:
$ref: "#/components/schemas/Study"
/workflow_spec/{workflow_spec_id}/files:
get:
operationId: crc.api.workflow.get_workflow_spec_files
summary: Provides a list of workflow specs and their signature
security: [] # Disable security for this endpoint only - we'll sanity check
# in the endpoint
parameters:
- name: workflow_spec_id
in: path
required: true
description: The workflow_spec id
schema:
type: string
tags:
- Workflow Spec States
responses:
'200':
description: An array of workflow specs, with last touched date and file signature.
content:
application/json:
schema:
type: array
items:
$ref: "#/components/schemas/Study"
/workflow_spec/{workflow_spec_id}/files/diff:
get:
operationId: crc.api.workflow.get_changed_files
summary: Provides a list of workflow specs and their signature
security: [] # Disable security for this endpoint only - we'll sanity check
# in the endpoint
parameters:
- name: workflow_spec_id
in: path
required: true
description: The workflow_spec id
schema:
type: string
- name: remote
in: query
required: true
description: The remote endpoint
schema:
type: string
tags:
- Workflow Spec States
responses:
'200':
description: An array of workflow specs, with last touched date and file signature.
content:
application/json:
schema:
type: array
items:
$ref: "#/components/schemas/Study"
/workflow_spec/all:
get:
operationId: crc.api.workflow.get_all_spec_state
summary: Provides a list of workflow specs and their signature
summary: Provides a list of files for a workflow spec
security: [] # Disable security for this endpoint only - we'll sanity check
# in the endpoint
tags:

View File

@ -320,6 +320,53 @@ def get_changed_workflows(remote):
return output.reset_index().to_dict(orient='records')
def get_changed_files(remote,workflow_spec_id):
"""
gets a remote endpoint - gets the files for a workflow_spec on both
local and remote and determines what files have been change and returns a list of those
files
"""
response = requests.get('http://'+remote+'/v1.0/workflow_spec/'+workflow_spec_id+'/files')
# This is probably very and may allow cross site attacks - fix later
remote = pd.DataFrame(json.loads(response.text))
# get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index
local = get_workflow_spec_files_dataframe(workflow_spec_id).reset_index()
local['md5_hash'] = local['md5_hash'].astype('str')
different = remote.merge(local,
right_on=['filename','md5_hash'],
left_on=['filename','md5_hash'],
how = 'outer' ,
indicator=True).loc[lambda x : x['_merge']!='both']
# each line has a tag on it - if was in the left or the right,
# label it so we know if that was on the remote or local machine
different.loc[different['_merge']=='left_only','location'] = 'remote'
different.loc[different['_merge']=='right_only','location'] = 'local'
# this takes the different date_created_x and date-created_y columns and
# combines them back into one date_created column
index = different['date_created_x'].isnull()
different.loc[index,'date_created_x'] = different[index]['date_created_y']
different = different[['date_created_x','filename','location']].copy()
different.columns=['date_created','filename','location']
# our different list will have multiple entries for a workflow if there is a version on either side
# we want to grab the most recent one, so we sort and grab the most recent one for each workflow
changedfiles = different.sort_values('date_created',ascending=False).groupby('filename').first()
# get an exclusive or list of workflow ids - that is we want lists of files that are
# on one machine or the other, but not both
remote_spec_ids = remote[['filename']]
local_spec_ids = local[['filename']]
left = remote_spec_ids[~remote_spec_ids['filename'].isin(local_spec_ids['filename'])]
right = local_spec_ids[~local_spec_ids['filename'].isin(remote_spec_ids['filename'])]
changedfiles['new'] = False
changedfiles.loc[changedfiles.index.isin(left['filename']), 'new'] = True
changedfiles.loc[changedfiles.index.isin(right['filename']),'new'] = True
# return the list as a dict, let swagger convert it to json
return changedfiles.reset_index().to_dict(orient='records')
def get_all_spec_state():
@ -331,6 +378,39 @@ def get_all_spec_state():
df = get_all_spec_state_dataframe()
return df.reset_index().to_dict(orient='records')
def get_workflow_spec_files(workflow_spec_id):
"""
Return a list of all workflow specs along with last updated date and a
thumbprint of all of the files that are used for that workflow_spec
Convert into a dict list from a dataframe
"""
df = get_workflow_spec_files_dataframe(workflow_spec_id)
return df.reset_index().to_dict(orient='records')
def get_workflow_spec_files_dataframe(workflowid):
"""
Return a list of all files for a workflow_spec along with last updated date and a
hash so we can determine file differences for a changed workflow on a box.
Return a dataframe
"""
x = session.query(FileDataModel).join(FileModel).filter(FileModel.workflow_spec_id==workflowid)
# there might be a cleaner way of getting a data frome from some of the
# fields in the ORM - but this works OK
filelist = []
for file in x:
filelist.append({'file_model_id':file.file_model_id,
'workflow_spec_id': file.file_model.workflow_spec_id,
'md5_hash':file.md5_hash,
'filename':file.file_model.name,
'date_created':file.date_created})
df = pd.DataFrame(filelist).sort_values('date_created').groupby('file_model_id').last()
df['date_created'] = df['date_created'].astype('str')
return df
def get_all_spec_state_dataframe():
"""
Return a list of all workflow specs along with last updated date and a
@ -338,7 +418,6 @@ def get_all_spec_state_dataframe():
Return a dataframe
"""
x = session.query(FileDataModel).join(FileModel)
# there might be a cleaner way of getting a data frome from some of the
# fields in the ORM - but this works OK
filelist = []