From 3e8d4ca7c942af25f5a3f8f0608417068462de5b Mon Sep 17 00:00:00 2001
From: Kelly McDonald
Date: Fri, 4 Dec 2020 10:23:03 -0500
Subject: [PATCH] Add some inline documentation to make the process more clear

---
 crc/api/workflow.py | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/crc/api/workflow.py b/crc/api/workflow.py
index a65f6dda..726aa78e 100644
--- a/crc/api/workflow.py
+++ b/crc/api/workflow.py
@@ -273,31 +273,50 @@ def get_changed_workflows(remote):
     gets a remote endpoint - gets the workflows and then determines what workflows are different from the remote endpoint
     """
-    x = requests.get('http://'+remote+'/v1.0/workflow_spec/all')
+    response = requests.get('http://'+remote+'/v1.0/workflow_spec/all')
     # This is probably very insecure and may allow cross site attacks - fix later
-    remote = pd.DataFrame(eval(x.text))
+    remote = pd.DataFrame(json.loads(response.text))
+    # get the local thumbprints & make sure that 'workflow_spec_id' is a column, not an index
     local = get_all_spec_state_dataframe().reset_index()
-    different = remote.merge(local, right_on=['workflow_spec_id','md5_hash'], left_on=['workflow_spec_id',
-                                             'md5_hash'], how = 'outer' , indicator=True).loc[
-        lambda x : x['_merge']!='both']
+    # merge these on workflow spec id and hash - this will
+    # make two different date columns, date_created_x and date_created_y
+    different = remote.merge(local,
+                             right_on=['workflow_spec_id','md5_hash'],
+                             left_on=['workflow_spec_id','md5_hash'],
+                             how='outer',
+                             indicator=True).loc[lambda x: x['_merge'] != 'both']
+
+    # each line has a tag on it - if it was in the left or the right,
+    # label it so we know whether it was on the remote or local machine
     different.loc[different['_merge']=='left_only','location'] = 'remote'
     different.loc[different['_merge']=='right_only','location'] = 'local'
-    #changedfiles = different.copy()
+
+    # this takes the different date_created_x and date_created_y columns and
+    # combines them back into one date_created column
     index = different['date_created_x'].isnull()
     different.loc[index,'date_created_x'] = different[index]['date_created_y']
     different = different[['workflow_spec_id','date_created_x','location']].copy()
     different.columns=['workflow_spec_id','date_created','location']
+
+    # our different list will have multiple entries for a workflow if there is a version on either side -
+    # we want the most recent one, so we sort by date and keep the first row for each workflow
     changedfiles = different.sort_values('date_created',ascending=False).groupby('workflow_spec_id').first()
+
+    # get an exclusive-or list of workflow ids - that is, we want lists of files that are
+    # on one machine or the other, but not both
     remote_spec_ids = remote[['workflow_spec_id']]
     local_spec_ids = local[['workflow_spec_id']]
-
     left = remote_spec_ids[~remote_spec_ids['workflow_spec_id'].isin(local_spec_ids['workflow_spec_id'])]
     right = local_spec_ids[~local_spec_ids['workflow_spec_id'].isin(remote_spec_ids['workflow_spec_id'])]
+
+    # flag files as new that are only on the remote box and remove the files that are only on the local box
     changedfiles['new'] = False
     changedfiles.loc[changedfiles.index.isin(left['workflow_spec_id']), 'new'] = True
     output = changedfiles[~changedfiles.index.isin(right['workflow_spec_id'])]
+
+    # return the list as a dict, let swagger convert it to json
     return output.reset_index().to_dict(orient='records')

@@ -339,9 +358,9 @@ def get_all_spec_state_dataframe():
     # workflow spec
     df = df.groupby('workflow_spec_id').agg({'date_created':'max',
                                             'md5_hash':join_uuids}).copy()
+    # get only the columns we are really interested in returning
     df = df[['date_created','md5_hash']].copy()
+    # convert dates to string
     df['date_created'] = df['date_created'].astype('str')
-
-
     return df
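
Note (illustration only, not part of the patch): the heavy lifting in
get_changed_workflows() is pandas' outer merge with indicator=True, which tags
each row 'both', 'left_only', or 'right_only' so the rows that exist on only
one side can be kept and labeled. Below is a minimal, self-contained sketch of
that technique - the spec ids, hashes, and dates are made up, and on= is used
in place of the patch's identical left_on/right_on lists:

    import pandas as pd

    # two thumbprint tables keyed on workflow_spec_id + md5_hash
    # (all values here are invented for illustration)
    remote = pd.DataFrame({'workflow_spec_id': ['a', 'b', 'c'],
                           'md5_hash': ['h1', 'h2', 'h3'],
                           'date_created': ['2020-12-01', '2020-12-02', '2020-12-03']})
    local = pd.DataFrame({'workflow_spec_id': ['a', 'b', 'd'],
                          'md5_hash': ['h1', 'h9', 'h4'],
                          'date_created': ['2020-12-01', '2020-11-30', '2020-12-04']})

    # outer merge on the key columns; indicator=True adds a '_merge' column, and
    # the shared non-key column 'date_created' splits into date_created_x / date_created_y,
    # which is exactly the two-date-column situation the patch comments describe
    different = remote.merge(local,
                             on=['workflow_spec_id', 'md5_hash'],
                             how='outer',
                             indicator=True).loc[lambda x: x['_merge'] != 'both']

    # 'left_only' rows came from the remote frame, 'right_only' rows from the local one
    different.loc[different['_merge'] == 'left_only', 'location'] = 'remote'
    different.loc[different['_merge'] == 'right_only', 'location'] = 'local'

    print(different[['workflow_spec_id', 'md5_hash', 'location']])
    # spec 'a' matches on both sides so it drops out; 'b' appears twice (its hash
    # differs, so each side contributes a row); 'c' is remote-only; 'd' is local-only

The duplicate rows for 'b' are why the function later sorts by date_created and
takes groupby(...).first(): only the most recent entry per workflow survives.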