diff --git a/graph.py b/graph.py index 18fb788..309b167 100644 --- a/graph.py +++ b/graph.py @@ -13,7 +13,7 @@ class PDGraphPeers(): def unique_peers_counts(self): return self.df.groupby(['Peer'])['Date'].nunique() - def number_of_days(self, exclude=20): + def days_per_peers(self, exclude=20): nu_peers = self.unique_peers_counts() ex_twenty_day = nu_peers[nu_peers > exclude] ax = sns.distplot(ex_twenty_day, kde=False, hist=True) @@ -22,4 +22,47 @@ class PDGraphPeers(): xlabel='# of days', ylabel='# of peers' ) - return ax.get_figure() + return ax + + def weekly_cohorts(self): + self.df['datetime'] = pd.to_datetime(self.df['Date']) + self.df['week'] = self.df['datetime'].dt.to_period('W') + self.df['month'] = self.df['datetime'].dt.to_period('M') + self.df['cohort'] = self.df.groupby('Peer')['datetime'].transform('min').dt.to_period('W') + df_cohort = self.df.groupby(['cohort', 'week']).agg(n_peers=('Peer', 'nunique')).reset_index(drop=False) + + df_cohort['period_number'] = ( + df_cohort.week - df_cohort.cohort).apply(attrgetter('n') + ) + cohort_pivot = df_cohort.pivot_table( + index = 'cohort', + columns = 'period_number', + values = 'n_peers' + ) + cohort_size = cohort_pivot.iloc[:,0] + retention_matrix = cohort_pivot.divide(cohort_size, axis = 0) + + fig, ax = plt.subplots(1, 2, figsize=(12, 8), sharey=True, gridspec_kw={'width_ratios': [1, 11]}) + + # retention matrix + sns.heatmap(retention_matrix, + mask=retention_matrix.isnull(), + annot=True, + fmt='.0%', + cmap='RdYlGn', + ax=ax[1]) + ax[1].set_title('Weekly Cohorts: Peer Retention', fontsize=16) + ax[1].set(xlabel='# of periods', + ylabel='') + + # cohort size + cohort_size_df = pd.DataFrame(cohort_size).rename(columns={0: 'cohort_size'}) + white_cmap = mcolors.ListedColormap(['white']) + + fig.tight_layout() + return sns.heatmap(cohort_size_df, + annot=True, + cbar=False, + fmt='g', + cmap=white_cmap, + ax=ax[0]) diff --git a/main.py b/main.py index 627a7a7..e6c120b 100755 --- a/main.py +++ b/main.py @@ -38,9 +38,11 @@ def main(): pdg = PDGraphPeers(data) - print(pdg.unique_peers_counts()) - plot = pdg.number_of_days() - plot.savefig("output.png") + plot = pdg.days_per_peers() + plot.figure.savefig("days_per_peers.png") + + matrix = pdg.weekly_cohorts() + matrix.figure.savefig("weekly_cohorts.png") if __name__ == '__main__': main() diff --git a/output.png b/output.png deleted file mode 100644 index 5f2c8b6..0000000 Binary files a/output.png and /dev/null differ