In [97]:
from pylab import *
import pandas as pd
import pandas.tools.plotting as pt
from netCDF4 import Dataset
import datetime as dt
from sklearn import ensemble as ens
from sklearn import linear_model as lin
from sklearn.pipeline import Pipeline
from sklearn import preprocessing as prep
In [98]:
nino34 = pd.read_csv("http://www.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/detrend.nino34.ascii.txt", sep="\s+")
In [99]:
link = pd.read_csv("https://raw.githubusercontent.com/johncarlosbaez/el-nino/master/R/average-link-strength.txt", header=None)
link.shape
Out[99]:
(1132, 1)
In [268]:
url="http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis.derived/surface/air.mon.mean.nc"
#url="http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis.derived/surface/air.sig995.mon.mean.nc"
nc=Dataset(url)
#dir(nc)
In [269]:
vars=nc.variables
t = vars['time']
day0 = dt.date(1800,1,1)
days=amap(lambda x: day0 + dt.timedelta(hours=x), t[:])
In [270]:
air0=vars["air"][:,:,:]
print air0.shape
(802, 73, 144)

In [271]:
#print days
X, Y, Z = air0.shape
air = air0.reshape(X, Y*Z)[days >= dt.date(1949,07,01)][:nino34.shape[0]]
print air.shape
(778, 10512)

In [272]:
xt = ens.ExtraTreesRegressor(n_estimators=2000, max_features=0.5, max_depth=None, min_samples_leaf=1,
                         oob_score=False, bootstrap=False, n_jobs=3, verbose=0)

sgd =  Pipeline([
    ("scale", prep.StandardScaler()),
    ("norm", prep.Normalizer()),
    ("sgd",lin.SGDRegressor(loss="epsilon_insensitive", penalty='elasticnet', n_iter=1000, alpha= (2**-20), epsilon=1, 
                            l1_ratio=0.9,
                        #learning_rate="optimal",
                    shuffle=True, verbose=3)),
    ])
    
model=xt

train = air[0:400]
print train.shape
model.fit(X=train, y=nino34.iloc[0:400,-1])
(400, 10512)

Out[272]:
ExtraTreesRegressor(bootstrap=True, compute_importances=None, criterion='mse',
          max_depth=None, max_features=0.5, max_leaf_nodes=None,
          min_density=None, min_samples_leaf=1, min_samples_split=2,
          n_estimators=2000, n_jobs=3, oob_score=True, random_state=None,
          verbose=0)
In [281]:
print model.score(X=train, y=nino34.iloc[0:400,-1]), model.score(air[400:788,:], nino34.iloc[400:788, -1])
0.928413218247 0.235866144462

In [296]:
test = pd.DataFrame({"true": nino34.iloc[400:788, -1], "predicted": 2*model.predict(air[400:788,:])})
test.plot("true", "predicted", kind="scatter", alpha=0.2, marker=".", figsize=(7,7))
Out[296]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f364c7c69d0>
In [297]:
test.plot(alpha=0.2, marker=".", figsize=(20,3))
Out[297]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f364c6b1150>
In [298]:
figure(figsize=(20,3))
zzz=xcorr(test.predicted, test.true, maxlags=None)
In [299]:
figure(figsize=(20,3))
zzz=acorr(test.true, maxlags=None)
In [300]:
figure(figsize=(20,3))
zzz=acorr(test.predicted, maxlags=None)
In [301]:
future=pd.DataFrame({"future": model.predict(air[-6:,:])})
In [302]:
future.plot()
Out[302]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f364c2ca390>
In [285]:
figure(figsize=(20,10))
imshow(air[0,:].reshape((Y,Z)))
Out[285]:
<matplotlib.image.AxesImage at 0x7f364bf5d6d0>
In [306]:
figure(figsize=(20,10))
land = Dataset("http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis/surface/land.nc").variables["land"][0,:,:]
imshow(model.feature_importances_.reshape((Y,Z))-0.0003*land)
Out[306]:
<matplotlib.image.AxesImage at 0x7f364bf39f50>
In []: