Add some inline documentation to make the process more clear
This commit is contained in:
parent 10dce542ec
commit 3e8d4ca7c9
@@ -273,31 +273,50 @@ def get_changed_workflows(remote):
     gets a remote endpoint - gets the workflows and then
     determines what workflows are different from the remote endpoint
     """
-    x = requests.get('http://'+remote+'/v1.0/workflow_spec/all')
+    response = requests.get('http://'+remote+'/v1.0/workflow_spec/all')
 
     # This is probably very insecure and may allow cross site attacks - fix later
-    remote = pd.DataFrame(eval(x.text))
+    remote = pd.DataFrame(json.loads(response.text))
+
+    # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index
     local = get_all_spec_state_dataframe().reset_index()
-    different = remote.merge(local, right_on=['workflow_spec_id','md5_hash'], left_on=['workflow_spec_id',
-                                              'md5_hash'], how = 'outer' , indicator=True).loc[
-        lambda x : x['_merge']!='both']
+
+    # merge these on workflow spec id and hash - this will
+    # make two different date columns, date_created_x and date_created_y
+    different = remote.merge(local,
+                             right_on=['workflow_spec_id','md5_hash'],
+                             left_on=['workflow_spec_id','md5_hash'],
+                             how = 'outer' ,
+                             indicator=True).loc[lambda x : x['_merge']!='both']
+
+    # each line has a tag on it - if it was in the left or the right,
+    # label it so we know if that was on the remote or local machine
     different.loc[different['_merge']=='left_only','location'] = 'remote'
     different.loc[different['_merge']=='right_only','location'] = 'local'
-    #changedfiles = different.copy()
+
+    # this takes the different date_created_x and date_created_y columns and
+    # combines them back into one date_created column
     index = different['date_created_x'].isnull()
     different.loc[index,'date_created_x'] = different[index]['date_created_y']
     different = different[['workflow_spec_id','date_created_x','location']].copy()
     different.columns=['workflow_spec_id','date_created','location']
+
+    # our different list will have multiple entries for a workflow if there is a version on either side
+    # we want to grab the most recent one, so we sort and grab the most recent one for each workflow
     changedfiles = different.sort_values('date_created',ascending=False).groupby('workflow_spec_id').first()
+
+    # get an exclusive or list of workflow ids - that is we want lists of files that are
+    # on one machine or the other, but not both
     remote_spec_ids = remote[['workflow_spec_id']]
     local_spec_ids = local[['workflow_spec_id']]
     left = remote_spec_ids[~remote_spec_ids['workflow_spec_id'].isin(local_spec_ids['workflow_spec_id'])]
     right = local_spec_ids[~local_spec_ids['workflow_spec_id'].isin(remote_spec_ids['workflow_spec_id'])]
+
+    # flag files as new that are only on the remote box and remove the files that are only on the local box
     changedfiles['new'] = False
     changedfiles.loc[changedfiles.index.isin(left['workflow_spec_id']), 'new'] = True
     output = changedfiles[~changedfiles.index.isin(right['workflow_spec_id'])]
+
+    # return the list as a dict, let swagger convert it to json
     return output.reset_index().to_dict(orient='records')
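The one functional change in this hunk is the swap from eval() to
json.loads() when parsing the remote response - the change the
"cross site attacks" comment alludes to. A minimal standalone sketch
of why it matters (the payload below is invented for illustration):

    import json

    # hypothetical body returned by /v1.0/workflow_spec/all
    body = '[{"workflow_spec_id": "example", "md5_hash": "abc123"}]'
    print(json.loads(body))  # parses data only, never executes it

    # eval(body) happens to yield the same list here, but eval runs
    # arbitrary expressions: a hostile server could return something
    # like "__import__('os').system('...')" and eval would execute it,
    # while json.loads would simply raise a JSONDecodeError.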
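The diffing itself leans on pandas' indicator=True outer merge: _merge
tags each row 'left_only', 'right_only', or 'both', and dropping 'both'
leaves only the rows that differ. A standalone sketch of the pattern
(invented thumbprints; on= is used here since both frames share the key
columns, where the commit spells out left_on/right_on):

    import pandas as pd

    remote = pd.DataFrame({'workflow_spec_id': ['a', 'b'],
                           'md5_hash': ['h1', 'h2']})
    local = pd.DataFrame({'workflow_spec_id': ['b', 'c'],
                          'md5_hash': ['h2-changed', 'h3']})

    # outer merge keeps every (id, hash) pair from both sides
    different = remote.merge(local,
                             on=['workflow_spec_id', 'md5_hash'],
                             how='outer',
                             indicator=True).loc[lambda x: x['_merge'] != 'both']
    different.loc[different['_merge'] == 'left_only', 'location'] = 'remote'
    different.loc[different['_merge'] == 'right_only', 'location'] = 'local'
    print(different[['workflow_spec_id', 'location']])

'a' comes out remote-only and 'c' local-only, while 'b' appears twice
(once per side) because its hash changed - exactly the duplication the
sort/groupby/first() step above resolves by keeping the newest entry.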
@@ -339,9 +358,9 @@ def get_all_spec_state_dataframe():
     # workflow spec
     df = df.groupby('workflow_spec_id').agg({'date_created':'max',
                                              'md5_hash':join_uuids}).copy()
-
+    # get only the columns we are really interested in returning
     df = df[['date_created','md5_hash']].copy()
-
+    # convert dates to string
     df['date_created'] = df['date_created'].astype('str')
 
     return df
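For context on the second hunk: get_all_spec_state_dataframe() collapses
the per-file rows into one thumbprint per workflow spec - newest date
plus a single hash over all the spec's files. join_uuids is defined
elsewhere in this module; the version below is only a stand-in (an
order-independent md5 over the sorted hashes), and the frame is invented:

    import hashlib
    import pandas as pd

    def join_uuids(uuids):
        # stand-in for the module's join_uuids helper: sort first so the
        # digest does not depend on row order
        combined = ''.join(sorted(str(u) for u in uuids))
        return hashlib.md5(combined.encode('utf-8')).hexdigest()

    files = pd.DataFrame({'workflow_spec_id': ['a', 'a', 'b'],
                          'md5_hash': ['h1', 'h2', 'h3'],
                          'date_created': pd.to_datetime(
                              ['2021-01-01', '2021-02-01', '2021-01-15'])})

    # one row per spec: max date, combined hash, dates back to strings
    df = files.groupby('workflow_spec_id').agg({'date_created': 'max',
                                                'md5_hash': join_uuids}).copy()
    df = df[['date_created', 'md5_hash']].copy()
    df['date_created'] = df['date_created'].astype('str')
    print(df)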