from pylab import *
import pandas as pd
import pandas.tools.plotting as pt
from netCDF4 import Dataset
import datetime as dt
from sklearn import ensemble as ens
from sklearn import linear_model as lin
from sklearn.pipeline import Pipeline
from sklearn import preprocessing as prep
nino34 = pd.read_csv("http://www.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/detrend.nino34.ascii.txt", sep="\s+")
link = pd.read_csv("https://raw.githubusercontent.com/johncarlosbaez/el-nino/master/R/average-link-strength.txt", header=None)
link.shape
url="http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis.derived/surface/air.mon.mean.nc"
#url="http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis.derived/surface/air.sig995.mon.mean.nc"
nc=Dataset(url)
#dir(nc)
vars=nc.variables
t = vars['time']
day0 = dt.date(1800,1,1)
days=amap(lambda x: day0 + dt.timedelta(hours=x), t[:])
air0=vars["air"][:,:,:]
print air0.shape
#print days
X, Y, Z = air0.shape
air = air0.reshape(X, Y*Z)[days >= dt.date(1949,07,01)][:nino34.shape[0]]
print air.shape
xt = ens.ExtraTreesRegressor(n_estimators=2000, max_features=0.5, max_depth=None, min_samples_leaf=1,
oob_score=False, bootstrap=False, n_jobs=3, verbose=0)
sgd = Pipeline([
("scale", prep.StandardScaler()),
("norm", prep.Normalizer()),
("sgd",lin.SGDRegressor(loss="epsilon_insensitive", penalty='elasticnet', n_iter=1000, alpha= (2**-20), epsilon=1,
l1_ratio=0.9,
#learning_rate="optimal",
shuffle=True, verbose=3)),
])
model=xt
train = air[0:400]
print train.shape
model.fit(X=train, y=nino34.iloc[0:400,-1])
print model.score(X=train, y=nino34.iloc[0:400,-1]), model.score(air[400:788,:], nino34.iloc[400:788, -1])
test = pd.DataFrame({"true": nino34.iloc[400:788, -1], "predicted": 2*model.predict(air[400:788,:])})
test.plot("true", "predicted", kind="scatter", alpha=0.2, marker=".", figsize=(7,7))
test.plot(alpha=0.2, marker=".", figsize=(20,3))
figure(figsize=(20,3))
zzz=xcorr(test.predicted, test.true, maxlags=None)
figure(figsize=(20,3))
zzz=acorr(test.true, maxlags=None)
figure(figsize=(20,3))
zzz=acorr(test.predicted, maxlags=None)
future=pd.DataFrame({"future": model.predict(air[-6:,:])})
future.plot()
figure(figsize=(20,10))
imshow(air[0,:].reshape((Y,Z)))
figure(figsize=(20,10))
land = Dataset("http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis/surface/land.nc").variables["land"][0,:,:]
imshow(model.feature_importances_.reshape((Y,Z))-0.0003*land)