setwd('/Users/dayeong/DeskTop/대학/22-1/비정형데이터분석/data/A_DeviceMotion_data')
library(stringr)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(seewave)
library(signal)
library(pracma)
library(fBasics)
library(rJava)
library(RWeka)
library(changepoint)
Sys.setenv(JAVA_HOME = '/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home')
RF <- make_Weka_classifier('weka/classifiers/trees/RandomForest')
Bayes_net <- make_Weka_classifier('weka/classifiers/bayes/BayesNet')
# Data loading
for(f in fls) {
a <- file.path(str_c(d, '/',f))
temp <- read.csv(a)
assign(f, temp)
}
# 전체 데이터 만들기
HAR_total <- data.frame()
for(f in fls) {
temp <- get(f)
HAR_total <- rbind(
HAR_total, temp %>% mutate(exp_no = unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[1],
id=unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[2],
activity=unlist(str_split(f, '\\\\_'))[1]))
}
# 가속도, 회전에 대해 magnitude 변수 만들기
HAR_total <- mag(HAR_total, 'userAcceleration')
HAR_total <- mag(HAR_total, 'rotationRate')
# skewness 함수
skewness <- function(x) {
(sum((x-mean(x))^3/length(x))/((sum(x-mean(x))^2)/length(x)))^(3/2)
}
# rms
rss <- function(x) rms(x) * (length(x))^0.5
HAR_summary_extend <- HAR_total %>% group_by(id, exp_no, activity) %>%
summarize_at(.vars = c('maguserAcceleration', 'magrotationRate'),
.funs = c(mean, min, max, sd, skewness, rms, rss, IQR, e1071::kurtosis, Entropy))
# 7. 피크분석 - 피크 관련 통계량
Peak_rslt <- data.frame()
# 원본 데이터에 센서 크기 변수를 추가하기
for (d in fls) {
f <- get(d)
f <- mag(f, 'rotationRate')
f <- mag(f, 'userAcceleration')
assign(d,f)
}
for(d in fls) {
f <- get(d) # 객체로 인식하게 함
p <- findpeaks(f$magrotationRate, threshold=4) # 자이로 센서 중 피크 4 이상의 값을 피크로 도출
Peak_rslt <- rbind(Peak_rslt, data.frame(d, # fls에 있는 파일별로 데이터 프레을 생성한 후, 행에 추가
f_n = ifelse(!is.null(p), dim(p)[1],0), # 피크 도출 결과 p가 null이 아니면 dim(p)[1]: 행의 개수 추출
p_interval = ifelse(!is.null(p), ifelse(dim(p)[1]>1, mean(diff(sort(p[,2]))),0),0), # 행이 두개 이상이면 diff 함수로 행의 차이를 구함
p_interval_std = ifelse(!is.null(p), ifelse(dim(p)[1]>2, std(diff(sort(p[,2]))),0),0), # 피크 간격의 표준편차
p_mean = ifelse(!is.null(p), mean(p[,1]),0),
p_max = ifelse(!is.null(p), max(p[,1]),0),
p_min = ifelse(!is.null(p), min(p[,1]),0),
p_std = ifelse(!is.null(p), std(p[,1]),0)))
}
# 전체 파고율 구하기
temp <- data.frame()
for(d in fls) {
f <- get(d) # d에 담긴 객체 반환
f <- f %>% select(magrotationRate, maguserAcceleration) # 자이로, 가속도 센서
cfR <- crest(f$magrotationRate, 50, plot=TRUE) # Hz: 1초에 50번을 찍어서 기록한 데이터
cfA <- crest(f$maguserAcceleration, 50, plot=TRUE)
temp <- rbind(temp, data.frame(d, cfR=cfR$C, cfA=cfA$C)) # C > 파고율 key value를 뽑아 행으로 추가
}
# temp data.frame <- {d(파일 이름), cfR key, cfA key 값}
# merge(피크 통계, 파고율)
Peak_final <- merge(Peak_rslt, temp, by='d') # 'd'를 기준으로 조인
# 추가 변수 생성
# id_f 적용부분을 for 문이 아닌 apply 함수로 작성하기
id_f_exp_no <- function(x) {
exp_no=unlist(regmatches(x, gregexpr('[[:digit:]]+', x)[1]))[1]
exp_no
}
id_f_id <- function(x) {
id=unlist(regmatches(x, gregexpr('[[:digit:]]+',x)[1]))[2]
id
}
id_f_activity <- function(x) {
activity=unlist(str_split(x, '\\\\_'))[1]
activity
}
# 피크 + 파고율
Peak_final2 <- data.frame(Peak_final)
Peak_final2$exp_no = unlist(lapply(Peak_final$d, id_f_exp_no))
Peak_final2$id = unlist(lapply(Peak_final$d, id_f_id))
Peak_final2$activity = unlist(lapply(Peak_final$d, id_f_activity))
# 변화 분석 결과로부터 변수화하기
# > 변화 횟수 변수화 전체코드
ch_pt <- data.frame()
for(d in fls) {
f <- get(d)
f <- mag(f, 'rotationRate') # 가속도, 자이로 -> 센서의 크기를 하나의 값으로 이용
f <- mag(f, 'userAcceleration')
# 평균의 변화를 감지하도록 두개 변수를 선택한 후에 각각에 대해서 'cpt.mean 적용'
rslt <- sapply(f %>% select(magrotationRate, maguserAcceleration), cpt.mean)
# 결과 rslt에서 '변화시점' 도출하기
rslt_cpts1 <- cpts(rslt$magrotationRate)
rslt_cpts2 <- cpts(rslt$maguserAcceleration)
# cpt.var, cpt.meanvar 적용
rslt2 <- sapply(f %>% select(magrotationRate, maguserAcceleration), cpt.var)
rslt2_cpts1 <- cpts(rslt2$magrotationRate)
rslt2_cpts2 <- cpts(rslt2$maguserAcceleration)
rslt3 <- sapply(f %>% select(magrotationRate, maguserAcceleration), cpt.meanvar)
# cpt.var, cptmeanvar에서 변화시점 도출
rslt3_cpts1 <- cpts(rslt3$magrotationRate)
rslt3_cpts2 <- cpts(rslt3$maguserAcceleration)
# 변화 발생 건수를 도출화
ch_pt <- rbind(ch_pt, data.frame(d, cp1=length(rslt_cpts1), cp2=length(rslt_cpts2),
cp3=length(rslt2_cpts1), cp4=length(rslt2_cpts2),
cp5=length(rslt3_cpts1), cp6=length(rslt3_cpts2)))
}
head(ch_pt) # 변화 횟수 변수화
temp <- data.frame(ch_pt)
temp$exp_no = unlist(lapply(ch_pt$d, id_f_exp_no))
temp$id = unlist(lapply(ch_pt$d, id_f_id))
temp$activity = unlist(lapply(ch_pt$d, id_f_activity))
head(temp)
ch_pt <- temp
# peak와 change 관련 테이블 merge
Peak_final3 <- merge(Peak_final2, ch_pt, by=c('d', 'id','exp_no', 'activity'))
# 필요없는 변수 제거
combined <- Peak_final3 %>% select(-d, -exp_no, -id)
colnames(combined) # 피크 통계, 파고율, 체크포인트
head(combined)
### 모델
m <- RF(as.factor(activity)~., data=final_combined)
e <- evaluate_Weka_classifier(m, numFolds = 10, complexity = TRUE, class=TRUE)
e
# 연습문제
combined_2 <- merge(Peak_final3, HAR_summary_extend, by=c('id', 'exp_no', 'activity'))
final_combined <- merge(combined_2, fft_total, by=c('id', 'exp_no', 'activity'))
## 쵳ㅊ쵳초칯 종 -> # 피크 통계, 파고율, 체크포인트, 푸리에
summary(final_combined)
colnames(final_combined)
RF <- make_Weka_classifier('weka/classifiers/trees/RandomForest')
Bayes_net <- make_Weka_classifier('weka/classifiers/bayes/BayesNet')
# 피크 관련 통계량
mag <- function(df, column) {
df[,str_c('mag', column)] <-
with(df, sqrt(get(str_c(column, '.x'))^2 + get(str_c(column, '.y'))^2+ get(str_c(column, '.z'))^2))
return(df)
}
d = getwd()
print(d)
fls = dir(d, recursive = T)
sub_info = fread('../data_subjects_info.csv')
# Data loading & 전체 데이터 만들기
myfread = function(f) {
dt = fread(f, drop=1)
#dt$fname = f
dt$exp_no = unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[1]
id = unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[2]
dt$id = id
dt$activity = unlist(str_split(f, '\\\\_'))[1]
# 가속도, 회전에 대해 magnitude 변수 만들기
dt <- mag(dt, 'userAcceleration')
dt <- mag(dt, 'rotationRate')
# info = sub_info[sub_info$code==id]
# bmi = info$weight/((info$height/100)^2)
# dt$bmiAcc = dt$maguserAcceleration*bmi
# dt$bmirR = dt$magrotationRate*bmi
dt
}
mylist = lapply(fls, myfread)
mydata = rbindlist(mylist)
HAR_total = rbindlist(mylist)
# skewness 함수
skewness <- function(x) {
(sum((x-mean(x))^3/length(x))/((sum(x-mean(x))^2)/length(x)))^(3/2)
}
# rms
rss <- function(x) rms(x) * (length(x))^0.5
# entropy
myEntropy = function(x) {
Entropy(x, base=length(x))
}
# diffSUM
diffSUM = function(x) {
sum(diff(x))
}
# diffMean
diffMean = function(x) {
mean(diff(x))
}
# diffmin
diffMin = function(x) {
min(diff(x))
}
# diffmax
diffMax = function(x) {
max(diff(x))
}
# diffEntropy
diffEntropy = function(x) {
Entropy(diff(x)/sum(diff(x)), base=length(x))
}
HAR_summary_extend <- data.frame(HAR_total) %>% group_by(id, exp_no, activity) %>%
summarize_at(.vars = c('maguserAcceleration', 'magrotationRate','attitude.roll', 'attitude.pitch', 'attitude.yaw'),
.funs = c(mean, min, max, sd, skewness, rms, rss, IQR, e1071::kurtosis, Entropy))
HAR_summary_extend <- data.frame(HAR_total) %>% group_by(id, exp_no, activity) %>%
summarize_at(.vars = colnames(HAR_total)[-grep('id|exp_no|activity|mag|attitude', colnames(HAR_total))],
.funs = c(mean, min, max, sd, rms, rss, IQR, e1071::kurtosis))
# skewness는 NA가 많이 나오는듯? raw데이터 넣으면 인스턴스 수가 적어짐
### 모델
m <- RF(as.factor(activity)~., data=HAR_summary_extend[,-grep('id|exp_no', colnames(HAR_summary_extend))])
e <- evaluate_Weka_classifier(m, numFolds = 10, complexity = TRUE, class=TRUE)
e
RF <- make_Weka_classifier('weka/classifiers/trees/RandomForest')
Bayes_net <- make_Weka_classifier('weka/classifiers/bayes/BayesNet')
# 피크 관련 통계량
mag <- function(df, column) {
df[,str_c('mag', column)] <-
with(df, sqrt(get(str_c(column, '.x'))^2 + get(str_c(column, '.y'))^2+ get(str_c(column, '.z'))^2))
return(df)
}
d = getwd()
print(d)
fls = dir(d, recursive = T)
sub_info = fread('../data_subjects_info.csv')
# Data loading & 전체 데이터 만들기
myfread = function(f) {
dt = fread(f, drop=1)
#dt$fname = f
dt$exp_no = unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[1]
id = unlist(regmatches(f, gregexpr('[[:digit:]]+', f)[1]))[2]
dt$id = id
dt$activity = unlist(str_split(f, '\\\\_'))[1]
# 가속도, 회전에 대해 magnitude 변수 만들기
dt <- mag(dt, 'userAcceleration')
dt <- mag(dt, 'rotationRate')
# info = sub_info[sub_info$code==id]
# bmi = info$weight/((info$height/100)^2)
# dt$bmiAcc = dt$maguserAcceleration*bmi
# dt$bmirR = dt$magrotationRate*bmi
dt
}
mylist = lapply(fls, myfread)
mydata = rbindlist(mylist)
HAR_total = rbindlist(mylist)
# skewness 함수
skewness <- function(x) {
(sum((x-mean(x))^3/length(x))/((sum(x-mean(x))^2)/length(x)))^(3/2)
}
# rms
rss <- function(x) rms(x) * (length(x))^0.5
# entropy
myEntropy = function(x) {
Entropy(x, base=length(x))
}
# diffSUM
diffSUM = function(x) {
sum(diff(x))
}
# diffMean
diffMean = function(x) {
mean(diff(x))
}
# diffmin
diffMin = function(x) {
min(diff(x))
}
# diffmax
diffMax = function(x) {
max(diff(x))
}
# diffEntropy
diffEntropy = function(x) {
Entropy(diff(x)/sum(diff(x)), base=length(x))
}
HAR_summary_extend <- data.frame(HAR_total) %>% group_by(id, exp_no, activity) %>%
summarize_at(.vars = c('maguserAcceleration', 'magrotationRate'),
.funs = c(mean, min, max, sd, skewness, rms, rss, IQR, e1071::kurtosis))
HAR_summary_extend <- data.frame(HAR_total) %>% group_by(id, exp_no, activity) %>%
summarize_at(.vars = colnames(HAR_total)[-grep('id|exp_no|activity|mag|attitude', colnames(HAR_total))],
.funs = c(mean, min, max, sd, rms, rss, IQR, e1071::kurtosis))
# skewness는 NA가 많이 나오는듯? raw데이터 넣으면 인스턴스 수가 적어짐
### 모델
m <- RF(as.factor(activity)~., data=HAR_summary_extend[,-grep('id|exp_no', colnames(HAR_summary_extend))])
e <- evaluate_Weka_classifier(m, numFolds = 10, complexity = TRUE, class=TRUE)
e