(과제) full code


setwd('/Users/dayeong/DeskTop/대학/22-1/비정형데이터분석/data/A_DeviceMotion_data')

library(stringr)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(seewave)
library(signal)
library(pracma)
library(fBasics)
library(rJava)
library(RWeka)
library(changepoint)

Sys.setenv(JAVA_HOME = '/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home')

RF <- make_Weka_classifier('weka/classifiers/trees/RandomForest')
Bayes_net <- make_Weka_classifier('weka/classifiers/bayes/BayesNet')

# Data loading
for(f in fls) {
  a <- file.path(str_c(d, '/',f))
  temp <- read.csv(a)
  assign(f, temp)
}

# 전체 데이터 만들기 
HAR_total <- data.frame() 

for(f in fls) {
  temp <- get(f)
  HAR_total <- rbind(
    HAR_total, temp %>% mutate(exp_no = unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[1],
                               id=unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[2],
                               activity=unlist(str_split(f, '\\\\_'))[1]))
}

# 가속도, 회전에 대해 magnitude 변수 만들기 
HAR_total <- mag(HAR_total, 'userAcceleration')
HAR_total <- mag(HAR_total, 'rotationRate')

# skewness 함수 
skewness <- function(x) {
  (sum((x-mean(x))^3/length(x))/((sum(x-mean(x))^2)/length(x)))^(3/2)
}
# rms 
rss <- function(x) rms(x) * (length(x))^0.5

HAR_summary_extend <- HAR_total %>% group_by(id, exp_no, activity) %>%
  summarize_at(.vars = c('maguserAcceleration', 'magrotationRate'), 
               .funs = c(mean, min, max, sd, skewness, rms, rss, IQR, e1071::kurtosis, Entropy))

# 7. 피크분석 - 피크 관련 통계량 
Peak_rslt <- data.frame()

# 원본 데이터에 센서 크기 변수를 추가하기 
for (d in fls) {
  f <- get(d)
  f <- mag(f, 'rotationRate')
  f <- mag(f, 'userAcceleration')
  assign(d,f)
}

for(d in fls) {
  f <- get(d) # 객체로 인식하게 함 
  p <- findpeaks(f$magrotationRate, threshold=4) # 자이로 센서 중 피크 4 이상의 값을 피크로 도출
  Peak_rslt <- rbind(Peak_rslt, data.frame(d,    # fls에 있는 파일별로 데이터 프레을 생성한 후, 행에 추가 
                                           f_n = ifelse(!is.null(p), dim(p)[1],0),        # 피크 도출 결과 p가 null이 아니면 dim(p)[1]: 행의 개수 추출
                                           p_interval = ifelse(!is.null(p), ifelse(dim(p)[1]>1, mean(diff(sort(p[,2]))),0),0), # 행이 두개 이상이면 diff 함수로 행의 차이를 구함
                                           p_interval_std = ifelse(!is.null(p), ifelse(dim(p)[1]>2, std(diff(sort(p[,2]))),0),0), # 피크 간격의 표준편차 
                                           p_mean = ifelse(!is.null(p), mean(p[,1]),0),
                                           p_max = ifelse(!is.null(p), max(p[,1]),0),
                                           p_min = ifelse(!is.null(p), min(p[,1]),0),
                                           p_std = ifelse(!is.null(p), std(p[,1]),0)))
}

# 전체 파고율 구하기 
temp <- data.frame()

for(d in fls) {
  f <- get(d) # d에 담긴 객체 반환
  f <- f %>% select(magrotationRate, maguserAcceleration) # 자이로, 가속도 센서
  cfR <- crest(f$magrotationRate, 50, plot=TRUE) # Hz: 1초에 50번을 찍어서 기록한 데이터 
  cfA <- crest(f$maguserAcceleration, 50, plot=TRUE)
  temp <- rbind(temp, data.frame(d, cfR=cfR$C, cfA=cfA$C)) # C > 파고율 key value를 뽑아 행으로 추가 
}

# temp data.frame <- {d(파일 이름), cfR key, cfA key 값}
# merge(피크 통계, 파고율)
Peak_final <- merge(Peak_rslt, temp, by='d') # 'd'를 기준으로 조인 

# 추가 변수 생성 
# id_f 적용부분을 for 문이 아닌 apply 함수로 작성하기 
id_f_exp_no <- function(x) {
  exp_no=unlist(regmatches(x, gregexpr('[[:digit:]]+', x)[1]))[1] 
  exp_no
}

id_f_id <- function(x) {
  id=unlist(regmatches(x, gregexpr('[[:digit:]]+',x)[1]))[2]
  id
}

id_f_activity <- function(x) {
  activity=unlist(str_split(x, '\\\\_'))[1]
  activity
}

# 피크 + 파고율 
Peak_final2 <- data.frame(Peak_final)
Peak_final2$exp_no = unlist(lapply(Peak_final$d, id_f_exp_no))
Peak_final2$id = unlist(lapply(Peak_final$d, id_f_id))
Peak_final2$activity = unlist(lapply(Peak_final$d, id_f_activity))

# 변화 분석 결과로부터 변수화하기 
# > 변화 횟수 변수화 전체코드 
ch_pt <- data.frame()

for(d in fls) {
  f <- get(d)
  f <- mag(f, 'rotationRate') # 가속도, 자이로 -> 센서의 크기를 하나의 값으로 이용
  f <- mag(f, 'userAcceleration')
  # 평균의 변화를 감지하도록 두개 변수를 선택한 후에 각각에 대해서 'cpt.mean 적용' 
  rslt <- sapply(f %>% select(magrotationRate, maguserAcceleration), cpt.mean) 
  
  # 결과 rslt에서 '변화시점' 도출하기 
  rslt_cpts1 <- cpts(rslt$magrotationRate)
  rslt_cpts2 <- cpts(rslt$maguserAcceleration)
  
  # cpt.var, cpt.meanvar 적용
  rslt2 <- sapply(f %>% select(magrotationRate, maguserAcceleration), cpt.var) 
  rslt2_cpts1 <- cpts(rslt2$magrotationRate)
  rslt2_cpts2 <- cpts(rslt2$maguserAcceleration)
  rslt3 <- sapply(f %>% select(magrotationRate, maguserAcceleration), cpt.meanvar)
  
  # cpt.var, cptmeanvar에서 변화시점 도출 
  rslt3_cpts1 <- cpts(rslt3$magrotationRate)
  rslt3_cpts2 <- cpts(rslt3$maguserAcceleration)
  
  # 변화 발생 건수를 도출화 
  ch_pt <- rbind(ch_pt, data.frame(d, cp1=length(rslt_cpts1), cp2=length(rslt_cpts2),
                                   cp3=length(rslt2_cpts1), cp4=length(rslt2_cpts2),
                                   cp5=length(rslt3_cpts1), cp6=length(rslt3_cpts2)))
}
head(ch_pt) # 변화 횟수 변수화

temp <- data.frame(ch_pt)
temp$exp_no = unlist(lapply(ch_pt$d, id_f_exp_no))
temp$id = unlist(lapply(ch_pt$d, id_f_id))
temp$activity = unlist(lapply(ch_pt$d, id_f_activity))

head(temp)
ch_pt <- temp

# peak와 change 관련 테이블 merge
Peak_final3 <- merge(Peak_final2, ch_pt, by=c('d', 'id','exp_no', 'activity'))

# 필요없는 변수 제거 
combined <- Peak_final3 %>% select(-d, -exp_no, -id)
colnames(combined) # 피크 통계, 파고율, 체크포인트
head(combined)

### 모델 
m <- RF(as.factor(activity)~., data=final_combined)
e <- evaluate_Weka_classifier(m, numFolds = 10, complexity = TRUE, class=TRUE)
e

# 연습문제 
combined_2 <- merge(Peak_final3, HAR_summary_extend, by=c('id', 'exp_no', 'activity'))
final_combined <- merge(combined_2, fft_total, by=c('id', 'exp_no', 'activity'))

## 쵳ㅊ쵳초칯 종 -> # 피크 통계, 파고율, 체크포인트, 푸리에 
summary(final_combined)
colnames(final_combined)

RF <- make_Weka_classifier('weka/classifiers/trees/RandomForest')
Bayes_net <- make_Weka_classifier('weka/classifiers/bayes/BayesNet')

# 피크 관련 통계량 
mag <- function(df, column) {
  df[,str_c('mag', column)] <- 
    with(df, sqrt(get(str_c(column, '.x'))^2 + get(str_c(column, '.y'))^2+ get(str_c(column, '.z'))^2))
  return(df)
}

d = getwd()
print(d)
fls = dir(d, recursive = T)

sub_info = fread('../data_subjects_info.csv')

# Data loading & 전체 데이터 만들기 
myfread = function(f) {
  dt = fread(f, drop=1)
  #dt$fname = f
  dt$exp_no = unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[1]
  id = unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[2]
  dt$id = id
  dt$activity = unlist(str_split(f, '\\\\_'))[1]
  # 가속도, 회전에 대해 magnitude 변수 만들기 
  dt <- mag(dt, 'userAcceleration')
  dt <- mag(dt, 'rotationRate')
  # info = sub_info[sub_info$code==id]
  # bmi = info$weight/((info$height/100)^2)
  # dt$bmiAcc = dt$maguserAcceleration*bmi
  # dt$bmirR = dt$magrotationRate*bmi
  dt
}

mylist = lapply(fls, myfread)
mydata = rbindlist(mylist)
HAR_total = rbindlist(mylist)

# skewness 함수 
skewness <- function(x) {
  (sum((x-mean(x))^3/length(x))/((sum(x-mean(x))^2)/length(x)))^(3/2)
}
# rms 
rss <- function(x) rms(x) * (length(x))^0.5

# entropy
myEntropy = function(x) {
  Entropy(x, base=length(x))
}
# diffSUM
diffSUM = function(x) {
  sum(diff(x))
}
# diffMean
diffMean = function(x) {
  mean(diff(x))
}
# diffmin
diffMin = function(x) {
  min(diff(x))
}
# diffmax
diffMax = function(x) {
  max(diff(x))
}
# diffEntropy
diffEntropy = function(x) {
  Entropy(diff(x)/sum(diff(x)), base=length(x))
}
HAR_summary_extend <- data.frame(HAR_total) %>% group_by(id, exp_no, activity) %>%
  summarize_at(.vars = c('maguserAcceleration', 'magrotationRate','attitude.roll', 'attitude.pitch', 'attitude.yaw'), 
               .funs = c(mean, min, max, sd, skewness, rms, rss, IQR, e1071::kurtosis, Entropy))

HAR_summary_extend <- data.frame(HAR_total) %>% group_by(id, exp_no, activity) %>%
  summarize_at(.vars = colnames(HAR_total)[-grep('id|exp_no|activity|mag|attitude', colnames(HAR_total))], 
               .funs = c(mean, min, max, sd, rms, rss, IQR, e1071::kurtosis))

# skewness는 NA가 많이 나오는듯? raw데이터 넣으면 인스턴스 수가 적어짐

### 모델 
m <- RF(as.factor(activity)~., data=HAR_summary_extend[,-grep('id|exp_no', colnames(HAR_summary_extend))])
e <- evaluate_Weka_classifier(m, numFolds = 10, complexity = TRUE, class=TRUE)
e

RF <- make_Weka_classifier('weka/classifiers/trees/RandomForest')
Bayes_net <- make_Weka_classifier('weka/classifiers/bayes/BayesNet')

# 피크 관련 통계량 
mag <- function(df, column) {
  df[,str_c('mag', column)] <- 
    with(df, sqrt(get(str_c(column, '.x'))^2 + get(str_c(column, '.y'))^2+ get(str_c(column, '.z'))^2))
  return(df)
}

d = getwd()
print(d)
fls = dir(d, recursive = T)

sub_info = fread('../data_subjects_info.csv')

# Data loading & 전체 데이터 만들기 
myfread = function(f) {
  dt = fread(f, drop=1)
  #dt$fname = f
  dt$exp_no = unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[1]
  id = unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[2]
  dt$id = id
  dt$activity = unlist(str_split(f, '\\\\_'))[1]
  # 가속도, 회전에 대해 magnitude 변수 만들기 
  dt <- mag(dt, 'userAcceleration')
  dt <- mag(dt, 'rotationRate')
  # info = sub_info[sub_info$code==id]
  # bmi = info$weight/((info$height/100)^2)
  # dt$bmiAcc = dt$maguserAcceleration*bmi
  # dt$bmirR = dt$magrotationRate*bmi
  dt
}

mylist = lapply(fls, myfread)
mydata = rbindlist(mylist)
HAR_total = rbindlist(mylist)

# skewness 함수 
skewness <- function(x) {
  (sum((x-mean(x))^3/length(x))/((sum(x-mean(x))^2)/length(x)))^(3/2)
}
# rms 
rss <- function(x) rms(x) * (length(x))^0.5

# entropy
myEntropy = function(x) {
  Entropy(x, base=length(x))
}
# diffSUM
diffSUM = function(x) {
  sum(diff(x))
}
# diffMean
diffMean = function(x) {
  mean(diff(x))
}
# diffmin
diffMin = function(x) {
  min(diff(x))
}
# diffmax
diffMax = function(x) {
  max(diff(x))
}
# diffEntropy
diffEntropy = function(x) {
  Entropy(diff(x)/sum(diff(x)), base=length(x))
}
HAR_summary_extend <- data.frame(HAR_total) %>% group_by(id, exp_no, activity) %>%
  summarize_at(.vars = c('maguserAcceleration', 'magrotationRate'), 
               .funs = c(mean, min, max, sd, skewness, rms, rss, IQR, e1071::kurtosis))

HAR_summary_extend <- data.frame(HAR_total) %>% group_by(id, exp_no, activity) %>%
  summarize_at(.vars = colnames(HAR_total)[-grep('id|exp_no|activity|mag|attitude', colnames(HAR_total))], 
               .funs = c(mean, min, max, sd, rms, rss, IQR, e1071::kurtosis))

# skewness는 NA가 많이 나오는듯? raw데이터 넣으면 인스턴스 수가 적어짐

### 모델 
m <- RF(as.factor(activity)~., data=HAR_summary_extend[,-grep('id|exp_no', colnames(HAR_summary_extend))])
e <- evaluate_Weka_classifier(m, numFolds = 10, complexity = TRUE, class=TRUE)
e