What's this?
I used Piyolog, a Japanese parenting application, to record my child's daily care.
I tried processing its data because I've collected quite a bit after more than 8 months of use.
Contents
Import modules and prepare for data
I'll import some modules that are used to process data.
import numpy as np import pandas as pd import re import datetime import os import matplotlib as mpl import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns
Because I'm using Google Colab now, I'll upload files to Google Drive and read data from there.
# Mount Google Drive so the exported Piyolog text files can be read from Colab.
from google.colab import drive

drive.mount('/content/drive')

# Folder on the mounted drive that holds the exported .txt files.
path = '/content/drive/[directory path]'

# Sorted so the load order is the same on every run
# (os.listdir returns entries in arbitrary order).
files = sorted(_ for _ in os.listdir(path) if _.endswith(r'.txt'))

# Raw text of every export file, one string per file.
month_texts = []
for filename in files:
    # The context manager closes the file even if read() raises.
    with open(os.path.join(path, filename), encoding='utf-8') as f:
        data = f.read()
    month_texts.append(data)
Process data
I'll convert the data into a pandas DataFrame because raw text like the example below is difficult to use.
02:15 Formula 100ml
05:30 Wake-up (11h30m)
06:50 Solid Food
# https://qiita.com/yakipudding/items/11223f12a843e4399300
# This code is based on the above link.
# Minor adjustments were made to the data format for piyolog English settings.

# Keywords that mark a line as a record worth keeping.
_ITEM_KEYWORDS = re.compile(
    'Formula|Nursing|Expressed|Pee|Poop|Vomit|Body|Walks|Baths|Others|Solid'
)
# A record line starts with an HH:MM timestamp.
_TIME_PREFIX = re.compile(r'([01][0-9]|2[0-3]):[0-5][0-9]')
# A nursing duration token such as "5m".
_MINUTES_TOKEN = re.compile(r'(\d+)m')
# Line that precedes each day's date line in the export.
_DAY_SEPARATOR = '----------'
_COLUMNS = ['date', 'datetime', 'category', 'item', 'time', 'volume',
            'time_zone']


def _classify_record(record):
    """Return ``(category, item, time, volume)`` for one split record line.

    ``record`` is the whitespace-split line: ``record[0]`` is the HH:MM
    timestamp and ``record[1]`` the item name.
    """
    item = record[1]
    if 'Formula' in item:
        # Bottle feeds get a fixed time of 15 (presumably minutes).
        return 'Food', item, 15, int(record[2].replace('ml', ''))
    if 'Nursing' in item:
        # Sum only tokens that are exactly "<digits>m"; other words on the
        # line (side names, separators, memo text) are ignored instead of
        # raising ValueError when they happen to contain an "m".
        matches = (_MINUTES_TOKEN.fullmatch(token) for token in record[2:])
        minutes = sum(int(m.group(1)) for m in matches if m)
        return 'Food', item, minutes, None
    if 'Poop' in item or 'Pee' in item:
        return 'Waste', item, None, None
    if 'Expressed' in item:
        # The amount is the fifth token, e.g. "Expressed breast milk 80ml".
        return 'Food', 'Expressed', 15, int(record[4].replace('ml', ''))
    return 'Other', item, None, None


def get_piyolog_all_items(month_texts):
    """Parse Piyolog text exports into a per-record DataFrame.

    Args:
        month_texts: iterable of strings, each the full text of one export
            file. A day starts with a ``----------`` line followed by a date
            line whose first five characters are skipped and whose remainder
            is parsed with ``%b %d, %Y``.

    Returns:
        DataFrame with columns ``date``, ``datetime``, ``category``, ``item``,
        ``time``, ``volume`` and ``time_zone`` (``'daytime'`` for hours 6-17,
        otherwise ``'night'``). An empty input yields an empty DataFrame with
        the same columns.

    Raises:
        ValueError: if a date line or a record timestamp cannot be parsed, or
            a Formula / Expressed amount is not an integer number of ml.
    """
    all_items = []
    for month_text in month_texts:
        lines = month_text.splitlines()
        # Reset per file so a record can never inherit a day from another file.
        day = None
        day_date = None
        for index, line in enumerate(lines):
            if line == _DAY_SEPARATOR and index < len(lines) - 1:
                day = lines[index + 1][5:]
                day_date = datetime.datetime.strptime(day, '%b %d, %Y')
                continue
            # Records that appear before any date header cannot be dated.
            if day is None or not check_item(line):
                continue
            record = line.split()
            record_dt = datetime.datetime.strptime(
                day + ' ' + record[0], '%b %d, %Y %H:%M'
            )
            category, item, timespan, volume = _classify_record(record)
            time_zone = 'daytime' if 6 <= record_dt.hour <= 17 else 'night'
            all_items.append(
                [day_date, record_dt, category, item, timespan, volume,
                 time_zone]
            )
    # time_zone is built with the rows, so an empty input still produces a
    # well-formed (empty) frame instead of failing in DataFrame.apply.
    return pd.DataFrame(all_items, columns=_COLUMNS)


def check_item(text):
    """Return True if ``text`` starts with HH:MM and names a tracked item."""
    return bool(_ITEM_KEYWORDS.search(text) and _TIME_PREFIX.match(text))
Then, I can see the DataFrame like this.
# Parse every loaded export text into one per-record DataFrame.
df = get_piyolog_all_items(month_texts)
# Preview the first rows (shown when this is the last expression of a cell).
df.head()
(a part of the output)
Next, I'll convert it to the DataFrame that is grouped by day.
# Per-day summaries, one small table per record kind. Each table has a
# 'date' column plus the metrics named in its .columns assignment.
df_milk = (
    df.query('item=="Formula"')
    .groupby('date')
    .agg({'datetime': 'count', 'volume': 'sum', 'time': 'sum'})
    .reset_index()
)
df_milk.columns = ['date', 'formula_count', 'formula_volume', 'formula_time']

df_mother_milk = (
    df.query('item=="Nursing"')
    .groupby('date')
    .agg({'datetime': 'count', 'time': 'sum'})
    .reset_index()
)
df_mother_milk.columns = ['date', 'nursing_count', 'nursing_time']

df_waste = (
    df.query('category=="Waste"')
    .groupby('date')
    .agg({'datetime': 'count'})
    .reset_index()
)
df_waste.columns = ['date', 'waste_count']

df_poop = (
    df.query('item=="Poop"')
    .groupby('date')
    .agg({'datetime': 'count'})
    .reset_index()
)
df_poop.columns = ['date', 'poop_count']

df_expressed = (
    df.query('item=="Expressed"')
    .groupby('date')
    .agg({'datetime': 'count', 'volume': 'sum', 'time': 'sum'})
    .reset_index()
)
df_expressed.columns = ['date', 'expressed_milk_count', 'expressed_milk',
                        'expressed_milk_time']

df_night_work = (
    df.query('time_zone=="night"')
    .groupby('date')
    .agg({'datetime': 'count'})
    .reset_index()
)
df_night_work.columns = ['date', 'night_work_count']

df_other_count = (
    df.query('category=="Other"')
    .groupby('date')
    .agg({'datetime': 'count'})
    .reset_index()
)
df_other_count.columns = ['date', 'other_count']

# Start from every date that has at least one record. Starting from df_milk
# would silently drop all metrics for days without a Formula record.
df_groupby_day = (
    df[['date']].drop_duplicates().sort_values('date').reset_index(drop=True)
)
for daily_table in (df_milk, df_mother_milk, df_waste, df_poop, df_expressed,
                    df_night_work, df_other_count):
    df_groupby_day = pd.merge(df_groupby_day, daily_table, on='date',
                              how='left')

# A metric missing for a day means no such record that day, i.e. zero.
df_groupby_day = df_groupby_day.fillna(0)

# Per-feed averages; days with a zero count give NaN (0 / 0).
df_groupby_day['formula_volume_per_count'] = (
    df_groupby_day['formula_volume'] / df_groupby_day['formula_count']
)
df_groupby_day['nursing_time_per_count'] = (
    df_groupby_day['nursing_time'] / df_groupby_day['nursing_count']
)

# Parenthesised so the sum can span several lines.
df_groupby_day['total_work_count'] = (
    df_groupby_day['formula_count']
    + df_groupby_day['nursing_count']
    + df_groupby_day['waste_count']
    + df_groupby_day['expressed_milk_count']
    + df_groupby_day['other_count']
)
df_groupby_day.head()
(a part of the output)
Visualize
I'll visualize the data as a time series to see how it changes.
def _plot_daily_lines(ax, title, series):
    """Draw one labelled line per ``(column, label)`` pair on ``ax``.

    Every line plots a column of ``df_groupby_day`` against ``date``. Labels
    are attached to the lines themselves, so the legend does not depend on
    the order in which artists were added to the axes.
    """
    ax.set_title(title)
    for column, label in series:
        sns.lineplot(x="date", y=column, data=df_groupby_day, ax=ax,
                     label=label)
    ax.legend()


# Figure 1: overall workload per day, with the night share.
fig1, (ax1) = plt.subplots(1, 1, figsize=(18, 5))
_plot_daily_lines(ax1, 'total_work_count', [
    ("total_work_count", "total_work"),
    ("night_work_count", "night_work"),
])

# Figure 2: feeding counts, formula volume and nursing time.
fig2, (ax2, ax3, ax4) = plt.subplots(1, 3, figsize=(18, 5))
_plot_daily_lines(ax2, 'formula&nursing_count', [
    ("formula_count", "formula"),
    ("nursing_count", "nursing"),
])
_plot_daily_lines(ax3, 'formula_volume', [
    ("formula_volume", "volume"),
    ("formula_volume_per_count", "volume_per_count"),
])
_plot_daily_lines(ax4, 'nursing_time', [
    ("nursing_time", "time"),
    ("nursing_time_per_count", "time_per_count"),
])

# Figure 3: expressed milk, diapers, and the breakdown of "Other" items.
fig3, (ax5, ax6, ax7) = plt.subplots(1, 3, figsize=(18, 5))
ax5.set_title('expressed_milk_count')
sns.lineplot(x="date", y="expressed_milk_count", data=df_groupby_day, ax=ax5)
_plot_daily_lines(ax6, 'waste_count', [
    ("waste_count", "waste"),
    ("poop_count", "poop"),
])

ax7.set_title('items_inclued_in_others')
other_item_counts = df.query('category=="Other"')['item'].value_counts()
# Name the counts column explicitly. pandas >= 2.0 calls the value_counts
# result 'count', so wrapping it in a DataFrame and reading ['item'] raises
# KeyError there.
sub_df = pd.DataFrame({'item': other_item_counts.to_numpy()},
                      index=other_item_counts.index.tolist())
sizes = sub_df['item']
labels = sub_df.index.tolist()
ax7.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=0)
(a part of the output)
Impression on Implementation
- I must review the code because I've realized that I forgot the basic Python modules.