Build my own Hello World

It's just a personal note.

I tried processing the data I recorded in Piyolog, a Japanese parenting application, and did a brief analysis of it.

What's this?

I use Piyolog, a Japanese parenting application, to record my child's daily care.

www.piyolog.com

I tried processing data in it because I've collected quite a bit of data after more than 8 months of use.

Contents

Import modules and prepare for data

I'll import some modules that are used to process data.

import numpy as np
import pandas as pd
import re
import datetime
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

Because I'm using Google Colab now, I'll upload files to Google Drive and read data from there.

# Mount Google Drive so the exported Piyolog text files can be read from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Folder on the mounted drive that holds the exported monthly *.txt files.
# Replace the placeholder with the real directory before running.
path = '/content/drive/[directory path]'

# os.listdir() returns entries in arbitrary order, so sort the names to load
# the files in a reproducible order.
files = sorted(name for name in os.listdir(path) if name.endswith('.txt'))

# One string per exported file, holding that file's full text.
month_texts = []
for filename in files:
    # The context manager closes the file even if reading or decoding fails.
    with open(os.path.join(path, filename), encoding='utf-8') as f:
        month_texts.append(f.read())

Process data

I'll convert the data into a DataFrame, because raw text like the sample below is difficult to work with.

02:15 Formula 100ml
05:30 Wake-up (11h30m)
06:50 Solid Food

# https://qiita.com/yakipudding/items/11223f12a843e4399300
# This code is based on the above link.
# Minor adjustments were made to the data format for piyolog English settings.

def get_piyolog_all_items(month_texts):
    """Parse Piyolog text exports into a DataFrame with one row per record.

    Each text is scanned line by line. A line equal to ``'----------'``
    announces a new day: the following line, minus its first five characters,
    is parsed as the date with the format ``"%b %d, %Y"``. Every line accepted
    by ``check_item`` is then turned into one row belonging to the most
    recently seen day.

    Args:
        month_texts: iterable of strings, each holding the text of one export.

    Returns:
        pandas.DataFrame with the columns

        * ``date``      - datetime of the day header the record belongs to
        * ``datetime``  - day combined with the record's ``HH:MM`` time
        * ``category``  - ``'Food'``, ``'Waste'`` or ``'Other'``
        * ``item``      - second token of the line (``'Expressed'`` for
          expressed-milk records)
        * ``time``      - 15 for Formula/Expressed records, the sum of the
          ``<n>m`` tokens for Nursing records, otherwise None
        * ``volume``    - number parsed from the ``<n>ml`` token for
          Formula/Expressed records, otherwise None
        * ``time_zone`` - ``'daytime'`` when the hour is 6..17, else ``'night'``

        An empty input yields an empty DataFrame with the same columns.

    Raises:
        ValueError: if a record line appears before any day header, or if a
            date/time/volume token cannot be parsed.
    """
    columns = ['date', 'datetime', 'category', 'item', 'time', 'volume', 'time_zone']
    all_items = []

    # Most recent day header seen so far. It is deliberately kept across
    # texts, so a record always belongs to the last header that preceded it.
    day = None
    day_date = None

    for month_text in month_texts:
        lines = month_text.splitlines()

        for index, item in enumerate(lines):
            if item == '----------' and index < len(lines) - 1:
                # Drops a 5-character prefix; presumably the weekday, as in
                # "Sun, Nov 1, 2020" - verify against a real export.
                day = lines[index + 1][5:]
                day_date = datetime.datetime.strptime(day, "%b %d, %Y")

            if item == '' or not check_item(item):
                continue

            if day is None:
                # Without a day header the record cannot be dated.
                raise ValueError(
                    f'record found before any day header: {item!r}')

            record = item.split()

            record_dt = datetime.datetime.strptime(
                day + ' ' + record[0], '%b %d, %Y %H:%M')
            record_type = None
            record_subtype = record[1]
            record_value = None
            record_timespan = None

            if 'Formula' in record_subtype:
                record_type = 'Food'
                record_timespan = 15
                record_value = int(record[2].replace('ml', ''))

            elif 'Nursing' in record_subtype:
                record_type = 'Food'
                record_time = 0
                for token in record[2:]:
                    # Only "<digits>m" tokens are durations; any other word
                    # that merely contains an "m" must not reach int().
                    duration = re.fullmatch(r'(\d+)m', token)
                    if duration:
                        record_time += int(duration.group(1))
                record_timespan = record_time

            elif 'Poop' in record_subtype or 'Pee' in record_subtype:
                record_type = 'Waste'

            elif 'Expressed' in record_subtype:
                record_type = 'Food'
                record_subtype = 'Expressed'
                record_timespan = 15
                # The volume is read from the fifth token of the line.
                record_value = int(record[4].replace('ml', ''))

            else:
                record_type = 'Other'

            # Label the record here rather than with a row-wise DataFrame.apply
            # afterwards: apply() on an empty frame does not return a single
            # column, which made the function fail on empty input.
            time_zone = 'daytime' if 6 <= record_dt.hour <= 17 else 'night'

            all_items.append([day_date, record_dt, record_type, record_subtype,
                              record_timespan, record_value, time_zone])

    return pd.DataFrame(all_items, columns=columns)


def check_item(text):
    """Return True when *text* is a record line this notebook processes.

    A line qualifies when it begins with a 24-hour ``HH:MM`` time and
    contains at least one of the known item keywords anywhere in the line.
    """
    # The time must sit at the very start of the line.
    if re.match(r'([01][0-9]|2[0-3]):[0-5][0-9]', text) is None:
        return False
    keyword = re.search('Formula|Nursing|Expressed|Pee|Poop|Vomit|Body|Walks|Baths|Others|Solid', text)
    return keyword is not None

Then, I can see the DataFrame like this.

# Parse every loaded export text into a single DataFrame.
df = get_piyolog_all_items(month_texts)

# Preview the first rows of the parsed records.
df.head()

(a part of the output)

Next, I'll aggregate it into a DataFrame grouped by day.

# Per-day aggregates, one small frame per kind of record. Each frame only
# contains the days on which that kind of record occurs.
df_milk = df.query('item=="Formula"').groupby('date').agg({'datetime': 'count', 'volume': 'sum', 'time': 'sum'}).reset_index()
df_milk.columns = ['date', 'formula_count', 'formula_volume', 'formula_time']
df_mother_milk = df.query('item=="Nursing"').groupby('date').agg({'datetime': 'count', 'time': 'sum'}).reset_index()
df_mother_milk.columns = ['date', 'nursing_count', 'nursing_time']
df_waste = df.query('category=="Waste"').groupby('date').agg({'datetime': 'count'}).reset_index()
df_waste.columns = ['date', 'waste_count']
df_poop = df.query('item=="Poop"').groupby('date').agg({'datetime': 'count'}).reset_index()
df_poop.columns = ['date', 'poop_count']
df_expressed = df.query('item=="Expressed"').groupby('date').agg({'datetime': 'count', 'volume': 'sum', 'time': 'sum'}).reset_index()
df_expressed.columns = ['date', 'expressed_milk_count', 'expressed_milk', 'expressed_milk_time']
df_night_work = df.query('time_zone=="night"').groupby('date').agg({'datetime': 'count'}).reset_index()
df_night_work.columns = ['date', 'night_work_count']
df_other_count = df.query('category=="Other"').groupby('date').agg({'datetime': 'count'}).reset_index()
df_other_count.columns = ['date', 'other_count']

# Combine the per-day frames with OUTER joins. Left-joining everything onto
# df_milk would silently drop every day that has no Formula record, even if
# other events were logged on that day.
daily_frames = [df_milk, df_mother_milk, df_waste, df_poop,
                df_expressed, df_night_work, df_other_count]
df_groupby_day = daily_frames[0]
for daily_frame in daily_frames[1:]:
    df_groupby_day = pd.merge(df_groupby_day, daily_frame, on='date', how='outer')

# Keep the rows in chronological order for the time-series plots.
df_groupby_day = df_groupby_day.sort_values('date').reset_index(drop=True)

# A day missing from one of the frames simply had no such record: count it as 0.
df_groupby_day = df_groupby_day.fillna(0)

# Per-feeding averages. On a day with a count of 0 the division is 0/0,
# which pandas evaluates to NaN.
df_groupby_day['formula_volume_per_count'] = df_groupby_day['formula_volume'] / df_groupby_day['formula_count']
df_groupby_day['nursing_time_per_count'] = df_groupby_day['nursing_time'] / df_groupby_day['nursing_count']

# poop_count is not added separately: Poop records are already part of
# waste_count (category "Waste").
df_groupby_day['total_work_count'] = (
    df_groupby_day['formula_count']
    + df_groupby_day['nursing_count']
    + df_groupby_day['waste_count']
    + df_groupby_day['expressed_milk_count']
    + df_groupby_day['other_count']
)

df_groupby_day.head()

(a part of the output)

Visualize

I'll visualize the data as a time series to see the changes.

# Figure 1: total number of logged events per day versus those logged at night.
fig1, ax1 = plt.subplots(1, 1, figsize=(18, 5))
ax1.set_title('total_work_count')
# Attach each label to its own line when it is drawn. ax.legend([...]) pairs
# labels with the axes' artists by position, so seaborn's error-band artists
# can end up with a label that was meant for a line.
sns.lineplot(x="date", y="total_work_count", data=df_groupby_day, ax=ax1, label="total_work")
sns.lineplot(x="date", y="night_work_count", data=df_groupby_day, ax=ax1, label="night_work")
ax1.legend()

# Figure 2: formula and nursing statistics per day, in three side-by-side panels.
# Labels are passed to each lineplot call rather than to ax.legend([...]),
# which pairs labels with artists by position and can mislabel them when
# seaborn adds error-band artists between the lines.
fig2, (ax2, ax3, ax4) = plt.subplots(1, 3, figsize=(18, 5))

# Left panel: number of formula feedings and nursing sessions per day.
ax2.set_title('formula&nursing_count')
sns.lineplot(x="date", y="formula_count", data=df_groupby_day, ax=ax2, label="formula")
sns.lineplot(x="date", y="nursing_count", data=df_groupby_day, ax=ax2, label="nursing")
ax2.legend()

# Middle panel: total formula volume per day and average volume per feeding.
ax3.set_title('formula_volume')
sns.lineplot(x="date", y="formula_volume", data=df_groupby_day, ax=ax3, label="volume")
sns.lineplot(x="date", y="formula_volume_per_count", data=df_groupby_day, ax=ax3, label="volume_per_count")
ax3.legend()

# Right panel: total nursing time per day and average time per session.
ax4.set_title('nursing_time')
sns.lineplot(x="date", y="nursing_time", data=df_groupby_day, ax=ax4, label="time")
sns.lineplot(x="date", y="nursing_time_per_count", data=df_groupby_day, ax=ax4, label="time_per_count")
ax4.legend()

# Figure 3: expressed-milk count, waste counts, and the make-up of "Other" records.
fig3, (ax5, ax6, ax7) = plt.subplots(1, 3, figsize=(18, 5))

# Left panel: number of expressed-milk feedings per day.
ax5.set_title('expressed_milk_count')
sns.lineplot(x="date", y="expressed_milk_count", data=df_groupby_day, ax=ax5)

# Middle panel: all waste records per day and the Poop subset.
# Labels are attached per line; ax.legend([...]) pairs labels with artists by
# position and can mislabel them when seaborn adds error-band artists.
ax6.set_title('waste_count')
sns.lineplot(x="date", y="waste_count", data=df_groupby_day, ax=ax6, label="waste")
sns.lineplot(x="date", y="poop_count", data=df_groupby_day, ax=ax6, label="poop")
ax6.legend()

# Right panel: share of each item among the records categorised as "Other".
ax7.set_title('items_inclued_in_others')
# Use the value_counts() Series directly: its index holds the item names and
# its values the counts. Wrapping it in a DataFrame and selecting the 'item'
# column depends on the name of the result, which is 'count' in pandas >= 2.0,
# so that lookup raises KeyError there.
other_item_counts = df.query('category=="Other"')['item'].value_counts()
sizes = other_item_counts.values
labels = other_item_counts.index.tolist()
ax7.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=0)

(a part of the output)

Impression on Implementation

  • I must review the code because I've realized that I forgot the basic Python modules.