multi_pgm_3강.txt

# 3강 주성분석

# R 2.2

install.packages("HSAUR2")
library(HSAUR2)
data(heptathlon)
head(heptathlon)

summary(heptathlon)
write.csv(heptathlon, file="c:/data/mva/heptathlon.csv")

# R 2.3

heptathlon$hurdles = max(heptathlon$hurdles) - heptathlon$hurdles
heptathlon$run200m = max(heptathlon$run200m) - heptathlon$run200m
heptathlon$run800m = max(heptathlon$run800m) - heptathlon$run800m
head(heptathlon

# R 2.4

hep_data = heptathlon[, -8]
hep_pca = princomp(hep_data, cor=T, scores=T)
names(hep_pca)
hep_pca

# R 2.5

summary(hep_pca)
eig_val = hep_pca$sdev^2
eig_val

# R 2.6

screeplot(hep_pca, type="lines", pch=19, main="Scree plot")
hep_var = hep_pca$sdev^2
hep_var
hep_var_ratio = hep_var/sum(hep_var)
round(hep_var_ratio, 3)

plot(cumsum(hep_var_ratio), type='b', pch=19, xlab='Component',
ylab='Cumulative Proportion')
title('Variance Explained')
# 주성분계수
hep_pca$loadings[, c(1:2)]

# R 2.7
hep_pca$scores[, c(1:2)]
biplot(hep_pca, cex=0.7, col=c("Red", "Blue"))
title("Biplot")

# R 2.8
beer = read.csv("c:/data/mva/beer.csv")
head(beer)

summary(beer)

# R 2.9
round(cor(beer), 2)
plot(beer, pch=19)

# R 2.10
beer_pca = princomp(beer, cor=F, scores=T)
beer_pca

# R 2.11
summary(beer_pca)

# R 2.12
screeplot(beer_pca, type="lines", pch=19)
beer_pca$loadings[, c(1:3)]

# Py 2.1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 데이터 읽기
heptathlon = pd.read_csv("c:/data/mva/heptathlon.csv")
heptathlon.head(3)

# 변수이름 확인하기
heptathlon.columns
# 기술통계량 구하기 – 소수점 이하 2자리 반올림 표시
round(heptathlon.describe(), 2)

#Py 2.2
# 변환: 변수최댓값 - 변숫값
heptathlon.hurdles = np.max(heptathlon.hurdles) - heptathlon.hurdles
heptathlon.run200m = np.max(heptathlon.run200m) - heptathlon.run200m
heptathlon.run800m = np.max(heptathlon.run800m) - heptathlon.run800m
heptathlon.head()

# 분석변수 선택하기
feature = ['hurdles','highjump','shot','run200m','longjump','javelin','run800m']
hep_data = heptathlon[feature]
# hep_data = heptathlon.iloc{:, 1:8]
# hep_data = heptathlon.iloc{:, 1:-1]

# 변수 표준화
from sklearn.preprocessing import StandardScaler
x = StandardScaler().fit_transform(hep_data)

# Py 2.3
# 초기 주성분분석
from sklearn.decomposition import PCA
pca_init = PCA(n_components=len(hep_data.columns))
pca_init.fit(x)
pca_init.explained_variance_

# 스크리 그림 그리기
plt.title('Scree Plot')
plt.xlabel('Components')
plt.ylabel('Explained Variance')
plt.plot(pca_init.explained_variance_, 'o-')
plt.show()

# Py 2.4
# 주성분분석 – 주성분 수 2개 추출
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
hep_pca = pca.fit_transform(x)
# dir(pca)
# 주성분분산
pca.explained_variance_

# 주성분분산 비율
pca.explained_variance_ratio_

# 주성분계수
np.round(pca.components_, 3)
# 주성분점수
hep_pca[0:5,:]

# Py 2.5
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 데이터 읽기
beer = pd.read_csv("c:/data/mva/beer.csv")
beer.head()

# 기술통계량 구하기
beer.describe()

# Py 2.6
# 주성분분석 – 주성분 수 3으로 함
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca_beer = pca.fit_transform(beer)

# 주성분분산
pca.explained_variance_

# 주성분 표준편차
np.sqrt(pca.explained_variance_)

# 주성분분산 비율
pca.explained_variance_ratio_

# 주성분계수
np.round(pca.components_, 3)