異常検知入門 with R - プロメモグラム

元記事はRによる異常検知の解説だが、ここでは同じ内容をPythonで実装する。
変数の並びからどう考えてもおかしい値を抜き出すことが目的。

データセットには、Rのcarパッケージに含まれるDavis(身長と体重のデータ)をCSVに書き出したものを使った。

Jupyter便利だった。

一変数

# coding: utf-8

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats

# Load the 'weight' column of Davis.csv as a 1-D NumPy array.
# Series.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
weight = pd.read_csv('Davis.csv')['weight'].to_numpy()

# Sample mean
m = weight.mean()

# Sample variance (divides by N, not N - 1)
s2 = ((weight - m) ** 2).mean()

# Anomaly score: squared deviation from the mean, scaled by the variance
a = (weight - m) ** 2 / s2

# Threshold: 99% point of the chi-squared distribution with 1 degree of freedom
th = sp.stats.chi2.ppf(0.99, 1)

# Plot the score of every observation, plus the threshold as a horizontal line.
# The line spans the actual number of observations instead of a fixed 200.
plt.scatter(np.arange(weight.size), a, color='g')
plt.plot([0, weight.size], [th, th], color='b', linestyle='-', linewidth=2)

f:id:zia_glass:20160728220025p:plain

二変数

# coding: utf-8

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
from numpy import linalg as la

# Load the whole Davis table as a NumPy array.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
davis = pd.read_csv('Davis.csv').to_numpy()

# Take columns 2 and 3 as the two variables and cast them to float once here,
# since the full table comes back with a mixed (object) dtype.
# NOTE(review): assumes columns 2 and 3 are weight and height - confirm
# against the layout of the CSV actually used.
x = davis[:, 2:4].astype(float)

# Scatter plot of the raw two-variable data
plt.scatter(x[:, 0], x[:, 1])

# Mean vector
mx = x.mean(axis=0)
mx

# Centered data matrix
xc = x - mx
xc.shape

# Sample covariance matrix (divides by N, not N - 1)
sx = xc.T.dot(xc) / x.shape[0]
sx

# Anomaly score: for each row r of xc, r @ inv(sx) @ r
ap = np.dot(xc, np.linalg.inv(sx)) * xc
a = ap.sum(axis=1)

# Threshold: 99% point of the chi-squared distribution with 2 degrees of freedom
th = sp.stats.chi2.ppf(0.99, 2)

# Draw the scores on a new figure so they do not overlay the raw-data scatter
# when the whole script runs in one go. The threshold line spans the actual
# number of observations instead of a fixed 200.
plt.figure()
plt.scatter(np.arange(a.size), a, color='g')
plt.plot([0, a.size], [th, th], color='b', linestyle='-', linewidth=2)

f:id:zia_glass:20160728221146p:plain f:id:zia_glass:20160728221125p:plain