머신러닝 파이썬 코드 스니펫
in Archive
copy and paste
File Open
txt file -> 한 줄씩 읽어 list로 저장하기
### for문 이용
# Read sample.txt line by line with an explicit for loop, storing each line
# with its surrounding whitespace (including the trailing newline) removed.
lines = []
with open("sample.txt") as f:
    for row in f:
        # Strip the current line; the file object `f` itself has no strip().
        lines.append(row.strip())
### readlines() 이용
# Read every line of sample2.txt at once with readlines(), then strip the
# surrounding whitespace (newline characters included) from each entry.
lines2 = []
with open("sample2.txt") as f:
    lines2 = list(map(str.strip, f.readlines()))
Normalize
최대 1, 최소 0이 되도록 선형 정규화
각 feature 별로 (column 별) 최솟값을 뺀 뒤, 그 결과의 최댓값(= 원래 최댓값 − 최솟값)으로 나누기
def normalize(X):
    """Linearly rescale every column of X to the range [0, 1].

    For each feature (column) the column minimum is subtracted and the
    result is divided by its new maximum, i.e. by the original
    ``max - min`` of that column.

    Args:
        X: Array-like of shape (n_samples, n_features).  A floating-point
            ndarray is normalised in place; any other input is first
            converted to a float array, so the caller's data is untouched.

    Returns:
        The normalised float array (the same object as ``X`` when ``X`` was
        already a floating-point ndarray).
    """
    X = np.asarray(X)
    if not np.issubdtype(X.dtype, np.floating):
        # In-place true division is not defined for integer arrays.
        X = X.astype(float)
    if X.size == 0:
        return X

    # Shift every column so that its minimum becomes 0.
    X -= X.min(axis=0)

    # Scale every column by its (shifted) maximum.  Constant columns have a
    # maximum of 0 here; they are skipped and stay all-zero instead of
    # turning into NaN through a division by zero.
    col_max = X.max(axis=0)
    np.divide(X, col_max, out=X, where=col_max != 0)
    return X
Scikit-learn
from sklearn.model_selection import train_test_split
Regression
Linear Regression
from sklearn.linear_model import LinearRegression
# Fit a linear regression model on the training split.
lrmodel = LinearRegression()
lrmodel.fit(train_x, train_y)

beta_0 = lrmodel.coef_[0]  # slope of the line found by lrmodel
beta_1 = lrmodel.intercept_  # y-intercept of the line found by lrmodel

print("beta_0: %f" % beta_0)
print("beta_1: %f" % beta_1)
# NOTE(review): the loss is evaluated on X / Y while the model was fitted on
# train_x / train_y -- confirm this is the intended data.
print("Loss: %f" % loss(X, Y, beta_0, beta_1))

# Predict on the held-out split and score the fitted model on it.
pred = lrmodel.predict(test_x)
score = lrmodel.score(test_x, test_y)
Ridge/Lasso/ElasticNet Regression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
# Regularised linear regressors; `alpha` is the penalty-strength argument
# passed to each estimator.
ridge_reg = Ridge(alpha=10)
lasso_reg = Lasso(alpha=10)
# ElasticNet additionally takes `l1_ratio`, its mixing parameter between the
# L1 and L2 penalties.
ElasticNet_reg = ElasticNet(alpha=0.001, l1_ratio=0.01)
Regression metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Mean absolute error between the targets `y` and the predictions `y_pred`.
# NOTE(review): binding the result to `loss` shadows the hand-written loss()
# function defined elsewhere in this file if both snippets share a module.
loss = mean_absolute_error(y, y_pred)
PCA
import sklearn.decomposition
# Project X onto its first `num_components` principal components.
num_components = 3
pca = sklearn.decomposition.PCA(n_components=num_components)
# fit() returns the fitted estimator, so fitting and transforming can be chained.
transform_res = pca.fit(X).transform(X)
Classification
SVM
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
# Train a support vector classifier and predict labels for the test split.
svm = SVC()
pred_y = svm.fit(train_X, train_y).predict(test_X)

# Print the SVM classification results.
cm = confusion_matrix(test_y, pred_y)
report = classification_report(test_y, pred_y)
print("\nConfusion matrix : \n", cm)
print("\nReport : \n", report)
Gaussian NB
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier, then predict the test split.
model = GaussianNB()
predicted = model.fit(train_X, train_y).predict(test_X)
Clustering
K-means
from sklearn.cluster import KMeans
# Cluster irisDF into three groups; fit() returns the fitted estimator, so
# construction and fitting are chained.
kmeans = KMeans(init="random", n_clusters=3, random_state=100).fit(irisDF)
Gaussian Mixture
from sklearn.mixture import GaussianMixture
# Fit a three-component Gaussian mixture on irisDF (fit() returns the fitted
# estimator), then store each row's predicted component in a new column.
gmm = GaussianMixture(n_components=3, random_state=100).fit(irisDF)
cluster_labels = gmm.predict(irisDF)
irisDF['cluster'] = cluster_labels
직접 구현
Loss Function
반복문
def loss(x, y, beta_0, beta_1):
    """Return the sum of squared residuals of the line beta_0 * x + beta_1.

    Args:
        x: Indexable sequence of inputs; its length sets the number of terms.
        y: Indexable sequence of targets, read at the same positions as x.
        beta_0: Slope of the line.
        beta_1: Intercept of the line.

    Returns:
        The sum over i of (y[i] - (beta_0 * x[i] + beta_1)) ** 2, or 0 when
        x is empty.
    """
    return sum(
        (y[idx] - (beta_0 * x[idx] + beta_1)) ** 2
        for idx in range(len(x))
    )
numpy 연산
# Squared residuals between targets and predictions, shared by both metrics.
squared_residuals = np.square(y - y_pred)
rss = np.sum(squared_residuals)  # residual sum of squares (RSS)
mse = np.mean(squared_residuals)  # mean squared error (MSE)
K-means Clustering
def kmeans(X, num_clusters, initial_centroid_indices):
    """Cluster the rows of X with a hand-written k-means loop.

    Each pass assigns every sample to its nearest centroid (Euclidean
    distance, ties going to the lowest cluster index) and then moves each
    centroid to the mean of the samples assigned to it.  The loop stops
    after the first pass in which no label changed.

    Args:
        X: Array-like of shape (n_samples, n_features); any number of
            features is supported.  X itself is never modified.
        num_clusters: Number of centroids to update on every pass.
        initial_centroid_indices: Row indices of X used as the starting
            centroids.

    Returns:
        A float ndarray of length n_samples holding each sample's cluster
        index (0, 1, ..., num_clusters - 1).
    """
    X = np.asarray(X)
    N = len(X)
    # Float copy: centroid means must not be truncated for integer data, and
    # updating centroids must never write back into X.
    centroids = np.array(X[initial_centroid_indices], dtype=float)
    labels = np.zeros(N)

    while True:
        # (N, K) matrix of Euclidean distances from every sample to every
        # centroid.
        diffs = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        distances = np.sqrt(np.sum(diffs ** 2, axis=2))
        nearest = np.argmin(distances, axis=1)

        # Did any label change during this pass?
        is_changed = bool(np.any(labels != nearest))
        labels[:] = nearest  # one of the cluster indices 0, 1, 2, ...

        # Compute the new centroids.
        for k in range(num_clusters):
            members = X[labels == k]
            # An empty cluster keeps its previous centroid; the mean of no
            # points would be NaN and poison every later distance.
            if len(members) > 0:
                centroids[k] = members.mean(axis=0)

        if not is_changed:
            break

    return labels
### 유클리드 거리 norm
def distance(x1, x2):
    """Return the Euclidean distance between x1 and x2.

    The element-wise difference is squared, summed over all elements, and
    the square root of that sum is returned.
    """
    difference = x1 - x2
    squared_sum = np.sum(np.square(difference))
    return np.sqrt(squared_sum)