#!/usr/bin/python
"""
skeleton code for k-means clustering mini-project
"""
import pickle
import numpy
import matplotlib.pyplot as plt
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature 1", f2_name="feature 2"):
    """Scatter-plot clustered points, save the figure to disk, then show it.

    Every point i is drawn at (features[i][0], features[i][1]) in the color
    assigned to its cluster label pred[i].  Only the first two entries of
    each feature row are plotted, so rows with extra features are accepted.

    Args:
        pred: sequence of integer cluster labels, one per point.
        features: position-indexable sequence of feature rows; each row
            needs at least two entries.
        poi: position-indexable sequence of truthy/falsy flags, one per
            point; only read when mark_poi is true.
        mark_poi: when true, overlay a red star on every point whose
            poi flag is truthy.
        name: path handed to plt.savefig.
        f1_name: label for the x axis.
        f2_name: label for the y axis.
    """
    ### plot each cluster with a different color; labels past the end of
    ### the palette wrap around instead of raising IndexError
    colors = ["b", "c", "k", "m", "g"]
    for ii, label in enumerate(pred):
        plt.scatter(features[ii][0], features[ii][1],
                    color=colors[label % len(colors)])

    ### if you like, place red stars over points that are POIs (just for funsies)
    if mark_poi:
        for ii, _ in enumerate(pred):
            if poi[ii]:
                plt.scatter(features[ii][0], features[ii][1], color="r", marker="*")

    plt.xlabel(f1_name)
    plt.ylabel(f2_name)
    ### save before show(): the figure is written to disk first, then displayed
    plt.savefig(name)
    plt.show()
### load in the dict of dicts containing all the data on each person in the dataset
### (the with-block closes the file handle as soon as unpickling finishes)
### NOTE(review): mode "r" is kept from the original script; Python 3's pickle
### needs a binary handle ("rb") -- confirm before porting
### NOTE(review): pickle.load runs arbitrary code from the file; only load
### a dataset you trust
with open("../final_project/final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)

### there's an outlier--remove it!
### (the default value 0 keeps pop() from raising KeyError if "TOTAL" is absent)
data_dict.pop("TOTAL", 0)

### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
poi = "poi"
features_list = [poi, feature_1, feature_2]
data = featureFormat(data_dict, features_list)
### poi is rebound here: from the feature name to the first value returned
### by targetFeatureSplit (presumably the per-person POI labels -- verify
### against tools/feature_format)
poi, finance_features = targetFeatureSplit(data)

### in the "clustering with 3 features" part of the mini-project,
### you'll want to change the loop header below to
###     for f1, f2, _ in finance_features:
### (as it's currently written, the loop assumes exactly 2 features per row)
for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()
### KMeans is imported here, right before its only use in this script
from sklearn.cluster import KMeans

### rebuild the feature rows from "poi" plus the two selected features
features_list = ["poi", feature_1, feature_2]
data2 = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data2)

### cluster here; create predictions of the cluster labels
### for the data and store them in pred
clf = KMeans(n_clusters=2)
pred = clf.fit_predict(finance_features)
Draw(pred, finance_features, poi, name="clusters_before_scaling.pdf", f1_name=feature_1, f2_name=feature_2)

### skeleton guard: if the clustering step above is removed, pred is undefined
### and the NameError is reported instead of crashing the script
try:
    Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
except NameError:
    ### parenthesized single-argument form prints the same text under
    ### Python 2 and is also valid Python 3 syntax
    print("no predictions object named pred found, no clusters to plot")
What is Snappy? - 구글에서 자체 개발한 압축 라이브러리 - 라이브러리 주소 http://code.google.com/p/snappy - 2015.01.05 현재 최신 버전은 snappy-1.1.1(?) 설치 과정 1. tar 파일 다운로드 https://code.google.com/p/snappy/downloads/list?can=1&q= 2. tar 파일 압축 풀기 3. root 계정으로 snappy 설치 1) 개인적으로 설치한 fedora에 gcc컴파일러, g++ 컴파일러가 없어 함께 설치하였다. g++ yum install gcc-c++ gcc yum -y install gcc 2) snappy 폴더 이동 3) ./configure --enable-shared 4) make 5) sudo make install 4. snappy native library를 하둡에 복사한다. 1) cp /usr/local/bin/libsnappy.* $하둡홈/lib/native/Linux-amd64-64 2) cp /usr/local/bin/libsnappy*.* $하둡홈/lib/native/Linux-i386-32 가장 중요한 사실!!!! - mac에서는 snappy 설치시 .so파일이 아닌 .dylib파일이 생성된다. - native library는 Cygwin, Mac OS X 환경에서 동작하지 않는다.( Native Libraries 참조 ) Sample Source @Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(SequenceFileCreator.class); conf.setJobName("SequenceFileCreator"); conf.setMapperClass(Dis...
댓글
댓글 쓰기