#Introduction 

###This is a capstone project for the Google Data Analytics course.

###Adeeba Amreen 

###Date 16th January 2024

#Objective 

###Proposing marketing strategies for Bellabeat smart devices, based on how 
###customers use their smart fitness trackers (Fitbit data). 


###I'm going to analyse the data based on the correlation between activity level, sleep and calories burnt.

#Description of the data used 

#I would like to express my gratitude to Möbius for supplying the pertinent dataset needed to study the use of smart wellness devices and their trends.
#License: CC0: Public Domain
#Source: https://zenodo.org/record/53894#.X9oeh3Uzaao
#Privacy: These datasets were generated by respondents to a distributed survey via Amazon Mechanical Turk between 03.12.2016-05.12.2016. Thirty eligible Fitbit users consented to the submission of personal tracker data, including minute-level output for physical activity, heart rate, and sleep monitoring.


library('tidyverse')
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library('janitor')
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library('skimr')
library('here')
## here() starts at /cloud/project
library('dplyr')
library(lubridate)
library(ggplot2)

##Loading the required datasets 

library(readr)
# Read the daily activity table (columns include Id, ActivityDate, TotalSteps
# and Calories) from the working directory.
dailyActivity_merged <- read_csv(file = "dailyActivity_merged.csv")
## Rows: 940 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ActivityDate
## dbl (14): Id, TotalSteps, TotalDistance, TrackerDistance, LoggedActivitiesDi...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the daily sleep table (Id, SleepDay, TotalSleepRecords,
# TotalMinutesAsleep, TotalTimeInBed) from the working directory.
sleepDay_merged <- read_csv(file = "sleepDay_merged.csv")
## Rows: 413 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): SleepDay
## dbl (4): Id, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##Summary of the dataset

# Preview the first six rows of the activity table.
head(x = dailyActivity_merged, n = 6L)
## # A tibble: 6 × 15
##           Id ActivityDate TotalSteps TotalDistance TrackerDistance
##        <dbl> <chr>             <dbl>         <dbl>           <dbl>
## 1 1503960366 4/12/2016         13162          8.5             8.5 
## 2 1503960366 4/13/2016         10735          6.97            6.97
## 3 1503960366 4/14/2016         10460          6.74            6.74
## 4 1503960366 4/15/2016          9762          6.28            6.28
## 5 1503960366 4/16/2016         12669          8.16            8.16
## 6 1503960366 4/17/2016          9705          6.48            6.48
## # ℹ 10 more variables: LoggedActivitiesDistance <dbl>,
## #   VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## #   LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## #   VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## #   LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>
# Preview the first six rows of the sleep table.
sleepDay_merged %>% head()
## # A tibble: 6 × 5
##           Id SleepDay        TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
##        <dbl> <chr>                       <dbl>              <dbl>          <dbl>
## 1 1503960366 4/12/2016 12:0…                 1                327            346
## 2 1503960366 4/13/2016 12:0…                 2                384            407
## 3 1503960366 4/15/2016 12:0…                 1                412            442
## 4 1503960366 4/16/2016 12:0…                 2                340            367
## 5 1503960366 4/17/2016 12:0…                 1                700            712
## 6 1503960366 4/19/2016 12:0…                 1                304            320
# Show column names and types of the activity table (ActivityDate is still
# character at this point).
str(object = dailyActivity_merged)
## spc_tbl_ [940 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Id                      : num [1:940] 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
##  $ ActivityDate            : chr [1:940] "4/12/2016" "4/13/2016" "4/14/2016" "4/15/2016" ...
##  $ TotalSteps              : num [1:940] 13162 10735 10460 9762 12669 ...
##  $ TotalDistance           : num [1:940] 8.5 6.97 6.74 6.28 8.16 ...
##  $ TrackerDistance         : num [1:940] 8.5 6.97 6.74 6.28 8.16 ...
##  $ LoggedActivitiesDistance: num [1:940] 0 0 0 0 0 0 0 0 0 0 ...
##  $ VeryActiveDistance      : num [1:940] 1.88 1.57 2.44 2.14 2.71 ...
##  $ ModeratelyActiveDistance: num [1:940] 0.55 0.69 0.4 1.26 0.41 ...
##  $ LightActiveDistance     : num [1:940] 6.06 4.71 3.91 2.83 5.04 ...
##  $ SedentaryActiveDistance : num [1:940] 0 0 0 0 0 0 0 0 0 0 ...
##  $ VeryActiveMinutes       : num [1:940] 25 21 30 29 36 38 42 50 28 19 ...
##  $ FairlyActiveMinutes     : num [1:940] 13 19 11 34 10 20 16 31 12 8 ...
##  $ LightlyActiveMinutes    : num [1:940] 328 217 181 209 221 164 233 264 205 211 ...
##  $ SedentaryMinutes        : num [1:940] 728 776 1218 726 773 ...
##  $ Calories                : num [1:940] 1985 1797 1776 1745 1863 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Id = col_double(),
##   ..   ActivityDate = col_character(),
##   ..   TotalSteps = col_double(),
##   ..   TotalDistance = col_double(),
##   ..   TrackerDistance = col_double(),
##   ..   LoggedActivitiesDistance = col_double(),
##   ..   VeryActiveDistance = col_double(),
##   ..   ModeratelyActiveDistance = col_double(),
##   ..   LightActiveDistance = col_double(),
##   ..   SedentaryActiveDistance = col_double(),
##   ..   VeryActiveMinutes = col_double(),
##   ..   FairlyActiveMinutes = col_double(),
##   ..   LightlyActiveMinutes = col_double(),
##   ..   SedentaryMinutes = col_double(),
##   ..   Calories = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Show column names and types of the sleep table (SleepDay is still character
# at this point).
sleepDay_merged %>% str()
## spc_tbl_ [413 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Id                : num [1:413] 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
##  $ SleepDay          : chr [1:413] "4/12/2016 12:00:00 AM" "4/13/2016 12:00:00 AM" "4/15/2016 12:00:00 AM" "4/16/2016 12:00:00 AM" ...
##  $ TotalSleepRecords : num [1:413] 1 2 1 2 1 1 1 1 1 1 ...
##  $ TotalMinutesAsleep: num [1:413] 327 384 412 340 700 304 360 325 361 430 ...
##  $ TotalTimeInBed    : num [1:413] 346 407 442 367 712 320 377 364 384 449 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Id = col_double(),
##   ..   SleepDay = col_character(),
##   ..   TotalSleepRecords = col_double(),
##   ..   TotalMinutesAsleep = col_double(),
##   ..   TotalTimeInBed = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
#Cleaning dates, converting the activity date and Sleep day 


# Convert both join keys to Date so activity and sleep rows can be matched by
# calendar day.

# ActivityDate is text such as "4/12/2016".
dailyActivity_merged$ActivityDate <- as.Date(
  dailyActivity_merged$ActivityDate,
  format = "%m/%d/%Y"
)

# SleepDay is text such as "4/12/2016 12:00:00 AM". Parse it to a date-time
# first, then keep only the date part.
# as.Date() on a date-time takes a time zone (`tz`) as its second argument,
# not a format string, so the zone is passed explicitly here instead of the
# strptime-style pattern that was previously (mis)used in that position.
sleepDay_merged$SleepDay <- as.Date(
  parse_date_time(sleepDay_merged$SleepDay, orders = "mdy HMS"),
  tz = "UTC"
)

#Merging the datasets of activity and sleep timing using left join and replacing NA with zero 

# Left join: keep every activity row and attach the sleep row recorded for the
# same user (Id) on the same day, where one exists.
daily_activity_sleep <- merge(
  x = dailyActivity_merged,
  y = sleepDay_merged,
  by.x = c("Id", "ActivityDate"),
  by.y = c("Id", "SleepDay"),
  all.x = TRUE
)

# Activity rows without a sleep match come back as NA; overwrite every NA cell
# with 0.
# NOTE(review): after this step a 0 in the sleep columns means "no sleep
# record for that day", not "zero minutes asleep".
daily_activity_sleep <- replace(
  daily_activity_sleep,
  is.na(daily_activity_sleep),
  0
)


head(x = daily_activity_sleep, n = 6L)
##           Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366   2016-04-12      13162          8.50            8.50
## 2 1503960366   2016-04-13      10735          6.97            6.97
## 3 1503960366   2016-04-14      10460          6.74            6.74
## 4 1503960366   2016-04-15       9762          6.28            6.28
## 5 1503960366   2016-04-16      12669          8.16            8.16
## 6 1503960366   2016-04-17       9705          6.48            6.48
##   LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1                        0               1.88                     0.55
## 2                        0               1.57                     0.69
## 3                        0               2.44                     0.40
## 4                        0               2.14                     1.26
## 5                        0               2.71                     0.41
## 6                        0               3.19                     0.78
##   LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1                6.06                       0                25
## 2                4.71                       0                21
## 3                3.91                       0                30
## 4                2.83                       0                29
## 5                5.04                       0                36
## 6                2.51                       0                38
##   FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1                  13                  328              728     1985
## 2                  19                  217              776     1797
## 3                  11                  181             1218     1776
## 4                  34                  209              726     1745
## 5                  10                  221              773     1863
## 6                  20                  164              539     1728
##   TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## 1                 1                327            346
## 2                 2                384            407
## 3                 0                  0              0
## 4                 1                412            442
## 5                 2                340            367
## 6                 1                700            712
#Summary of merged data set
# Per-column summary statistics of the merged table. The sleep columns include
# the zero-filled rows that had no sleep record.
summary(object = daily_activity_sleep)
##        Id             ActivityDate          TotalSteps    TotalDistance   
##  Min.   :1.504e+09   Min.   :2016-04-12   Min.   :    0   Min.   : 0.000  
##  1st Qu.:2.320e+09   1st Qu.:2016-04-19   1st Qu.: 3795   1st Qu.: 2.620  
##  Median :4.445e+09   Median :2016-04-26   Median : 7439   Median : 5.260  
##  Mean   :4.858e+09   Mean   :2016-04-26   Mean   : 7652   Mean   : 5.503  
##  3rd Qu.:6.962e+09   3rd Qu.:2016-05-04   3rd Qu.:10734   3rd Qu.: 7.720  
##  Max.   :8.878e+09   Max.   :2016-05-12   Max.   :36019   Max.   :28.030  
##  TrackerDistance  LoggedActivitiesDistance VeryActiveDistance
##  Min.   : 0.000   Min.   :0.000            Min.   : 0.000    
##  1st Qu.: 2.620   1st Qu.:0.000            1st Qu.: 0.000    
##  Median : 5.260   Median :0.000            Median : 0.220    
##  Mean   : 5.489   Mean   :0.110            Mean   : 1.504    
##  3rd Qu.: 7.715   3rd Qu.:0.000            3rd Qu.: 2.065    
##  Max.   :28.030   Max.   :4.942            Max.   :21.920    
##  ModeratelyActiveDistance LightActiveDistance SedentaryActiveDistance
##  Min.   :0.0000           Min.   : 0.000      Min.   :0.000000       
##  1st Qu.:0.0000           1st Qu.: 1.950      1st Qu.:0.000000       
##  Median :0.2400           Median : 3.380      Median :0.000000       
##  Mean   :0.5709           Mean   : 3.349      Mean   :0.001601       
##  3rd Qu.:0.8050           3rd Qu.: 4.790      3rd Qu.:0.000000       
##  Max.   :6.4800           Max.   :10.710      Max.   :0.110000       
##  VeryActiveMinutes FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes
##  Min.   :  0.00    Min.   :  0.00      Min.   :  0          Min.   :   0.0  
##  1st Qu.:  0.00    1st Qu.:  0.00      1st Qu.:127          1st Qu.: 729.0  
##  Median :  4.00    Median :  7.00      Median :199          Median :1057.0  
##  Mean   : 21.24    Mean   : 13.63      Mean   :193          Mean   : 990.4  
##  3rd Qu.: 32.00    3rd Qu.: 19.00      3rd Qu.:264          3rd Qu.:1229.0  
##  Max.   :210.00    Max.   :143.00      Max.   :518          Max.   :1440.0  
##     Calories    TotalSleepRecords TotalMinutesAsleep TotalTimeInBed 
##  Min.   :   0   Min.   :0.0000    Min.   :  0.0      Min.   :  0.0  
##  1st Qu.:1830   1st Qu.:0.0000    1st Qu.:  0.0      1st Qu.:  0.0  
##  Median :2140   Median :0.0000    Median :  0.0      Median :  0.0  
##  Mean   :2308   Mean   :0.4899    Mean   :183.7      Mean   :200.9  
##  3rd Qu.:2796   3rd Qu.:1.0000    3rd Qu.:417.5      3rd Qu.:450.5  
##  Max.   :4900   Max.   :3.0000    Max.   :796.0      Max.   :961.0
#Data Analysis 


#Merge Data Frames

#The two data frames have different numbers of observations (940 activity rows vs. 413 sleep rows), so we combine them with a left join that keeps every activity row. Activity rows without a matching sleep record get NA in the sleep columns, which is then replaced with zero.

# NOTE(review): this repeats the merge and zero-fill already performed in the
# cleaning section; presumably kept so this section can be run on its own.
daily_activity_sleep <- dailyActivity_merged %>%
  merge(
    y = sleepDay_merged,
    by.x = c("Id", "ActivityDate"),
    by.y = c("Id", "SleepDay"),
    all.x = TRUE
  )
# Unmatched activity rows have NA in the sleep columns; set every NA cell to 0.
daily_activity_sleep[is.na(daily_activity_sleep)] <- 0
daily_activity_sleep %>% head()
##           Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366   2016-04-12      13162          8.50            8.50
## 2 1503960366   2016-04-13      10735          6.97            6.97
## 3 1503960366   2016-04-14      10460          6.74            6.74
## 4 1503960366   2016-04-15       9762          6.28            6.28
## 5 1503960366   2016-04-16      12669          8.16            8.16
## 6 1503960366   2016-04-17       9705          6.48            6.48
##   LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1                        0               1.88                     0.55
## 2                        0               1.57                     0.69
## 3                        0               2.44                     0.40
## 4                        0               2.14                     1.26
## 5                        0               2.71                     0.41
## 6                        0               3.19                     0.78
##   LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1                6.06                       0                25
## 2                4.71                       0                21
## 3                3.91                       0                30
## 4                2.83                       0                29
## 5                5.04                       0                36
## 6                2.51                       0                38
##   FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1                  13                  328              728     1985
## 2                  19                  217              776     1797
## 3                  11                  181             1218     1776
## 4                  34                  209              726     1745
## 5                  10                  221              773     1863
## 6                  20                  164              539     1728
##   TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## 1                 1                327            346
## 2                 2                384            407
## 3                 0                  0              0
## 4                 1                412            442
## 5                 2                340            367
## 6                 1                700            712
# Create Categories
#Based on sleep, distance and calories burnt 

# Bucket each day by minutes asleep and by calories burnt.
#
# Days without a sleep log were zero-filled when the tables were merged
# (TotalSleepRecords is 0, or NA if the fill is skipped). They get their own
# "no sleep record" level instead of falling into "< 6h", so that missing data
# is not counted as short sleep.
#
# Boundaries are unchanged: exactly 360 minutes is "< 6h", exactly 480 minutes
# is "6h-8h", exactly 1500 calories is "< 1.5k", exactly 2500 is "1.5k-2.5k".
# Both columns are factors with explicit levels so plots show the categories
# in logical rather than alphabetical order.
daily_activity_sleep <- daily_activity_sleep %>%
  mutate(
    sleep_categories = factor(
      case_when(
        is.na(TotalSleepRecords) | TotalSleepRecords == 0 ~ "no sleep record",
        TotalMinutesAsleep > 480 ~ "> 8h",
        TotalMinutesAsleep > 360 ~ "6h-8h",
        TRUE ~ "< 6h"
      ),
      levels = c("< 6h", "6h-8h", "> 8h", "no sleep record")
    ),
    calorie_categories = factor(
      case_when(
        Calories > 2500 ~ "> 2.5k",
        Calories > 1500 ~ "1.5k-2.5k",
        TRUE ~ "< 1.5k"
      ),
      levels = c("< 1.5k", "1.5k-2.5k", "> 2.5k")
    )
  )


head(daily_activity_sleep)
##           Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366   2016-04-12      13162          8.50            8.50
## 2 1503960366   2016-04-13      10735          6.97            6.97
## 3 1503960366   2016-04-14      10460          6.74            6.74
## 4 1503960366   2016-04-15       9762          6.28            6.28
## 5 1503960366   2016-04-16      12669          8.16            8.16
## 6 1503960366   2016-04-17       9705          6.48            6.48
##   LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1                        0               1.88                     0.55
## 2                        0               1.57                     0.69
## 3                        0               2.44                     0.40
## 4                        0               2.14                     1.26
## 5                        0               2.71                     0.41
## 6                        0               3.19                     0.78
##   LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1                6.06                       0                25
## 2                4.71                       0                21
## 3                3.91                       0                30
## 4                2.83                       0                29
## 5                5.04                       0                36
## 6                2.51                       0                38
##   FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1                  13                  328              728     1985
## 2                  19                  217              776     1797
## 3                  11                  181             1218     1776
## 4                  34                  209              726     1745
## 5                  10                  221              773     1863
## 6                  20                  164              539     1728
##   TotalSleepRecords TotalMinutesAsleep TotalTimeInBed sleep_categories
## 1                 1                327            346             < 6h
## 2                 2                384            407            6h-8h
## 3                 0                  0              0  no sleep record
## 4                 1                412            442            6h-8h
## 5                 2                340            367             < 6h
## 6                 1                700            712             > 8h
##   calorie_categories
## 1          1.5k-2.5k
## 2          1.5k-2.5k
## 3          1.5k-2.5k
## 4          1.5k-2.5k
## 5          1.5k-2.5k
## 6          1.5k-2.5k
#Creating visualization

# Box plot: distribution of daily calories within each sleep category.
ggplot(
  daily_activity_sleep,
  aes(x = sleep_categories, y = Calories, fill = sleep_categories)
) +
  geom_boxplot() +
  ggtitle("Sleep VS Calories Burnt")

# Box plot: distribution of daily step counts within each calorie category.
ggplot(
  daily_activity_sleep,
  aes(x = calorie_categories, y = TotalSteps, fill = calorie_categories)
) +
  geom_boxplot() +
  ggtitle("Total Steps VS Calories Burnt")

#Summary of Data Analysis 

### 1. There is a correlation between the number of hours slept and the calories burnt. 
### 2. There is a correlation between the number of steps and the calories burnt. 
### 3. It is evident from the box plot that people who sleep 6-8 hours burn relatively more calories than people who sleep less than 6 hours or more than 8 hours. 
### 4. It is also evident that the number of steps taken affects the calories burnt: people who average 15000 steps have burnt more than 2.5k calories. A key finding is that steps, in addition to distance, are significant. 


#Business Recommendations 

### There is a strong correlation between getting the proper number of hours of sleep and calories burnt, so it is important for people to maintain 6-8 hours of sleep, neither more nor less. 
### The number of steps is significant for burning calories, so it is essential for people to track their steps for better fitness tracking. 
### Marketing strategies can inform customers about how efficiently Bellabeat products can track their activity and help them follow a healthy lifestyle conveniently, simply by wearing a bracelet, with no hassle.