diff --git a/crc/api/workflow.py b/crc/api/workflow.py index a65f6dda..726aa78e 100644 --- a/crc/api/workflow.py +++ b/crc/api/workflow.py @@ -273,31 +273,50 @@ def get_changed_workflows(remote): gets a remote endpoint - gets the workflows and then determines what workflows are different from the remote endpoint """ - x = requests.get('http://'+remote+'/v1.0/workflow_spec/all') + response = requests.get('http://'+remote+'/v1.0/workflow_spec/all') # This is probably very and may allow cross site attacks - fix later - remote = pd.DataFrame(eval(x.text)) + remote = pd.DataFrame(json.loads(response.text)) + # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index local = get_all_spec_state_dataframe().reset_index() - different = remote.merge(local, right_on=['workflow_spec_id','md5_hash'], left_on=['workflow_spec_id', - 'md5_hash'], how = 'outer' , indicator=True).loc[ - lambda x : x['_merge']!='both'] + # merge these on workflow spec id and hash - this will + # make two different date columns date_x and date_y + different = remote.merge(local, + right_on=['workflow_spec_id','md5_hash'], + left_on=['workflow_spec_id','md5_hash'], + how = 'outer' , + indicator=True).loc[lambda x : x['_merge']!='both'] + + # each line has a tag on it - if it was in the left or the right, + # label it so we know if that was on the remote or local machine different.loc[different['_merge']=='left_only','location'] = 'remote' different.loc[different['_merge']=='right_only','location'] = 'local' - #changedfiles = different.copy() + + # this takes the different date_created_x and date_created_y columns and + # combines them back into one date_created column index = different['date_created_x'].isnull() different.loc[index,'date_created_x'] = different[index]['date_created_y'] different = different[['workflow_spec_id','date_created_x','location']].copy() different.columns=['workflow_spec_id','date_created','location'] + + # our different list will have multiple entries for a workflow if there is a version on either side + # we want to grab the most recent one, so we sort and grab the most recent one for each workflow changedfiles = different.sort_values('date_created',ascending=False).groupby('workflow_spec_id').first() + + # get an exclusive or list of workflow ids - that is we want lists of files that are + # on one machine or the other, but not both remote_spec_ids = remote[['workflow_spec_id']] local_spec_ids = local[['workflow_spec_id']] - left = remote_spec_ids[~remote_spec_ids['workflow_spec_id'].isin(local_spec_ids['workflow_spec_id'])] right = local_spec_ids[~local_spec_ids['workflow_spec_id'].isin(remote_spec_ids['workflow_spec_id'])] + + # flag files as new that are only on the remote box and remove the files that are only on the local box changedfiles['new'] = False changedfiles.loc[changedfiles.index.isin(left['workflow_spec_id']), 'new'] = True output = changedfiles[~changedfiles.index.isin(right['workflow_spec_id'])] + + # return the list as a dict, let swagger convert it to json return output.reset_index().to_dict(orient='records') @@ -339,9 +358,9 @@ def get_all_spec_state_dataframe(): # workflow spec df = df.groupby('workflow_spec_id').agg({'date_created':'max', 'md5_hash':join_uuids}).copy() + # get only the columns we are really interested in returning df = df[['date_created','md5_hash']].copy() + # convert dates to string df['date_created'] = df['date_created'].astype('str') - - return df