オープンデータ活用

ライブラリの読込

In [1]:
from ipywidgets import FloatProgress
from IPython.display import display

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import datetime

水位データの読込

ダウンロードしたデータは新しい順(先頭が最新)に並んでいるため、読込後に日時の昇順へ並べ替えます。

In [2]:
# Load the water-level CSV. The first line of the file is skipped and no row
# is treated as a header, so the columns get default integer names.
filename = "sparql-2017.csv"
df_level = pd.read_csv(
    filename,
    skiprows=1,
    header=None,
)
In [3]:
# Give the three positional columns readable names.
df_level.columns = [
    "url",
    "datetime",
    "level",
]
In [4]:
# Parse the whole "datetime" column in one vectorized call instead of invoking
# pd.to_datetime once per row through a Python-level lambda.
df_level["datetime"] = pd.to_datetime(df_level["datetime"])
In [5]:
# Move the "datetime" column out of the frame and use it as the row index.
df_level = df_level.set_index("datetime")
In [6]:
# Order the rows by index, ascending.
df_level = df_level.sort_index(axis=0, ascending=True)
In [7]:
# Quick visual check of the level series; the y-axis is fixed to 0-250.
df_level.level.plot(ylim=(0, 250), figsize=(15, 5))
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x111964dd8>

降水量データの読込

In [8]:
# Load the precipitation CSV. The file is read as Shift-JIS and its first
# four lines are skipped before parsing.
filename = "data-2017.csv"
df_rain = pd.read_csv(
    filename,
    skiprows=4,
    encoding="SHIFT-JIS",
)
In [9]:
# Rename the five columns. The last three keep Japanese labels
# (roughly: "no-phenomenon info", "quality info", "homogeneity number").
df_rain.columns = [
    "datetime",
    "rain",
    "現象なし情報",
    "品質情報",
    "均質番号",
]
In [10]:
# Parse the whole "datetime" column in one vectorized call instead of invoking
# pd.to_datetime once per row through a Python-level lambda.
df_rain["datetime"] = pd.to_datetime(df_rain["datetime"])
In [11]:
# Move the "datetime" column out of the frame and use it as the row index.
df_rain = df_rain.set_index("datetime")
In [12]:
# Overlay level and rain on one figure with the y-axis fixed to 0-250.
# Rain is multiplied by 5 before plotting (presumably so it is visible on the
# same axis as the level values).
plt.figure(figsize=(15, 5))
plt.ylim(0, 250)
plt.plot(df_level["level"])
plt.plot(5 * df_rain["rain"])
Out[12]:
[<matplotlib.lines.Line2D at 0x11240b080>]

データ加工

In [13]:
# Keep only rain rows that are strictly earlier than the latest level timestamp.
df_rain = df_rain.loc[df_rain.index < df_level.index.max()]
In [14]:
# Build the supervised data set from consecutive rain timestamps.
#
# For each position i in the rain index:
#   features = the 10 highest level readings between ixs[i+1] and ixs[i+2]
#              (label slice, both ends inclusive), sorted descending,
#              followed by the rain value at position i
#   target   = the maximum level reading between ixs[i+2] and ixs[i+3]
# Positions where either window has 10 or fewer readings are skipped.
ixs = df_rain.index
level = df_level["level"]
rain = df_rain["rain"]

rows = []
y = []

for i in range(len(ixs) - 3):
    dt1, dt2, dt3 = ixs[i + 1], ixs[i + 2], ixs[i + 3]

    d1 = level.loc[dt1:dt2].tolist()
    d2 = level.loc[dt2:dt3].tolist()

    if len(d1) > 10 and len(d2) > 10:
        y.append(max(d2))
        # Ascending sort then reverse, keeping the first ten (the largest).
        features = sorted(d1)[::-1][:10]
        # Positional lookup: `.ix` was removed from pandas, and on a
        # datetime index an integer key meant "i-th row" anyway.
        features.append(rain.iloc[i])
        rows.append(features)

df = pd.DataFrame(rows)
df["y"] = y

df.shape
Out[14]:
(6862, 12)
In [15]:
# Show the first five rows of the assembled data set.
df.head(n=5)
Out[15]:
0 1 2 3 4 5 6 7 8 9 10 y
0 66 65 65 65 65 65 65 65 64 64 0.0 64
1 64 64 64 64 64 64 64 64 64 64 0.0 64
2 64 64 64 64 63 63 63 63 63 63 0.0 64
3 64 64 64 64 64 64 63 63 63 63 0.0 65
4 65 65 65 64 64 64 64 64 64 64 0.0 65

機械学習

In [16]:
# Feature columns are every column except the last one ("y").
X_cols = df.columns[0:-1]
In [17]:
# Convert features and target to NumPy arrays.
# DataFrame.as_matrix() / Series.as_matrix() were removed in pandas 1.0;
# `.values` yields the same ndarray on old and new pandas alike. The target is
# already one-dimensional, so no flatten() is needed.
X = df[X_cols].values.astype("float")
y = df["y"].values.astype("int")
In [18]:
# Split by position without shuffling: the first 90% of rows are used for
# training and the remainder for testing.
num = int(len(X) * 0.9)
print(len(X), num, len(X) - num)

X_train, X_test = X[:num], X[num:]
y_train, y_test = y[:num], y[num:]
6862 6175 687
In [19]:
# Standardization: fit the scaler on the training rows only.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)

# Overall standard deviation of the training features before scaling.
X_train.std()
Out[19]:
28.81180290625171
In [20]:
# Apply the fitted scaler to both splits.
# NOTE: this overwrites X_train / X_test, so re-running the cell without
# re-running the split would scale the data a second time.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Overall standard deviation of the training features after scaling.
X_train.std()
Out[20]:
1.0
In [21]:
# Model selection.
# Previously `model` was assigned three times in a row and only the last
# assignment (MLPRegressor) took effect. The choice is now explicit: change
# MODEL_NAME to switch estimators.
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor

MODEL_FACTORIES = {
    "random_forest": RandomForestRegressor,          # random forest
    "gradient_boosting": GradientBoostingRegressor,  # gradient boosting
    "mlp": MLPRegressor,                             # neural network
}
MODEL_NAME = "mlp"

model = MODEL_FACTORIES[MODEL_NAME](random_state=42)

model
Out[21]:
MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
In [22]:
# Train on the training split, then predict the test split.
# fit() returns the estimator itself, so the two calls can be chained.
result = model.fit(X_train, y_train).predict(X_test)
result.shape
Out[22]:
(687,)
In [23]:
# Score of the fitted model on the held-out test split.
test_score = model.score(X_test, y_test)
print(test_score)
0.937992677762
In [24]:
# Compare actual vs. predicted level on the test split, with rain overlaid.
# NOTE(review): "rain" comes from X_test, which has already been passed
# through scaler.transform, so it is the standardized value, not the raw one.
pp = pd.DataFrame(
    {
        "act": np.array(y_test),
        "pred": np.array(result),
        "rain": X_test[:, -1],
    }
)
pp["rain"] = pp["rain"] * 5

plt.figure(figsize=(15, 5))
plt.ylim(0, 250)
lines = plt.plot(pp)
# Call plt.legend rather than assigning to it: the old `plt.legend = ...`
# replaced the pyplot function with an Index and never drew a legend.
# (If that old cell already ran in this kernel, restart it to restore
# plt.legend.)
plt.legend(lines, pp.columns)
lines
Out[24]:
[<matplotlib.lines.Line2D at 0x11554a400>,
 <matplotlib.lines.Line2D at 0x11554a5c0>,
 <matplotlib.lines.Line2D at 0x11554a7b8>]
In [25]:
import random

# Pick one random row of the data set.
# randrange excludes the upper bound; randint(0, len(df)) could return
# len(df), which is one past the last valid row.
i = random.randrange(len(df))
# .iloc / .values replace the removed .ix / .as_matrix() APIs.
d = df.iloc[i].values.tolist()
print(i, d)

# Keep the ten level features of that row fixed and sweep the rain feature
# over 0..20, producing 21 synthetic inputs.
df_test = [d[:10] + [rain_value] for rain_value in range(21)]

# Scale with the same fitted scaler used for training, then predict.
scaled = scaler.transform(np.array(df_test).astype("float"))
test = model.predict(scaled)

plt.xlim(0, 20)
plt.plot(test)
2035 [65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 0.0, 65.0]
Out[25]:
[<matplotlib.lines.Line2D at 0x1160a6c50>]