##################################################
### ST906 - Financial Time Series (2019/2020) ####
##################################################

### Introduction #################################
# Some useful references
# https://nwfsc-timeseries.github.io/atsa-labs/chap-ts.html
# https://cran.r-project.org/web/packages/zoo/vignettes/zoo-read.pdf
# https://www.stat.pitt.edu/stoffer/tsa4/   ## this is the main reference, with the book (TSA4)

# install.packages("pacman")  # run this only the first time; after that just load the library
library(pacman)
p_load(datasets, zoo, FinTS, astsa, TTR, xts, TSA, quantmod, ggplot2)  # TSA provides periodogram(), used below
Sys.setlocale("LC_ALL", "English")  # Windows locale name; use e.g. "en_US.UTF-8" on Linux/macOS

?nyse
nyse
str(nyse)
class(nyse)  # "ts" (Time-Series object)
plot(nyse)

data(AirPassengers)
AirPassengers
class(AirPassengers)  # ts: it is printed like a matrix, but stored as a vector
ts.plot(AirPassengers)
plot(log(AirPassengers))

data(sp500w)
?sp500w
head(sp500w)
plot(sp500w, lwd = 1)
class(sp500w)  # xts object, indexed by dates; its plot method gives a nice visualisation
y <- ts(sp500w, start = 2003, freq = 52)  # the same data coerced to a plain ts
ts.plot(y)
autoplot.zoo(sp500w)  # zoo's method for ggplot2::autoplot, for a nicer visualisation
# http://www.sthda.com/english/articles/32-r-graphics-essentials/128-plot-time-series-data-using-ggplot/

### Daily returns (or percent changes) of the Dow Jones Industrial Average
head(djia)
?astsa::djia
plot(djia$Close)
matplot(djia[, -5], type = "l", lty = 1, col = 1:5)  # plot all series except the 5th column (Volume)
legend("bottomright", legend = names(djia)[-5], col = 1:5, lty = 1)
quantmod::getSymbols("^DJI", src = "yahoo")  # now the object DJI should be in your environment
head(DJI)
plot(DJI$DJI.Close)
djia_returns <- diff(log(djia$Close))[-1]  # approximate daily returns via log-differences (cf. TSA4)
plot(djia_returns, main = "DJIA Returns", type = "n")  # type = "n" draws only the background of the plot
lines(djia_returns, col = 1)
plot(djia)

### White noise ##################################
# Gaussian iid white noise
wn <- rnorm(200)
wn <- ts(wn)
ts.plot(wn)
var(wn)
abline(h = mean(wn), col = "orange")
wn_filtered_3  = filter(wn, sides = 2, filter = rep(1/3, 3))    # moving average, window of width 3
wn_filtered_27 = filter(wn, sides = 2, filter = rep(1/27, 27))  # moving average, window of width 27
lines(wn_filtered_3, col = 2)
lines(wn_filtered_27, col = 3)
# The wider the filter window, the smoother the moving average.

### Random walk (from TSA4) ######################
set.seed(154)  # so you can reproduce the results
w  = rnorm(200)
x  = cumsum(w)   # random walk
wd = w + .2      # white noise plus drift
xd = cumsum(wd)  # random walk with drift
plot.ts(xd, ylim = c(-5, 55), main = "random walk", ylab = '')
lines(x, col = 4)
abline(h = 0, col = 4, lty = 2)
abline(a = 0, b = .2, lty = 2)

### Stationarity - ACF and PACF ##################
acf(wn)  # the autocorrelation function of a white noise is (approximately) zero at every lag after 0
acf_theory <- ARMAacf(ar = 0, ma = c(0.7, 0.2), lag.max = 20)
plot(0:20, acf_theory, type = "h")  # theoretical ACF of an MA(2), lags 0 to 20
# Tip: MA models are usually identified with the ACF (the last lag at which it is non-zero gives the order)
pacf_theory <- ARMAacf(ar = 0.9, ma = 0, lag.max = 20, pacf = TRUE)
plot(1:20, pacf_theory)  # theoretical PACF of an AR(1)
# Tip: AR models are usually identified with the PACF (the last lag at which it is non-zero gives the order)
sp500w_stat <- sp500w[seq_len(52 * 5)]  # select a "stationary-looking" chunk of data (first 5 years)
plot(sp500w_stat, main = "SP500W")
acf(sp500w_stat)  # what do you get from here?
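# A quick sanity check of what acf() reports, as a minimal sketch: the lag-1 sample
# autocorrelation computed by hand, and the approximate 95% bands (the dashed lines in
# the ACF plot) at +/- 1.96/sqrt(n). It assumes sp500w_stat has no missing values;
# rho_hat is just an illustrative helper name.
rho_hat <- function(x, k) {
  x <- as.numeric(x)
  n <- length(x)
  xbar <- mean(x)
  sum((x[(k + 1):n] - xbar) * (x[1:(n - k)] - xbar)) / sum((x - xbar)^2)
}
rho_hat(sp500w_stat, 1)                               # lag-1 sample autocorrelation, by hand
stats::acf(sp500w_stat, plot = FALSE)$acf[2]          # the same value as reported by stats::acf()
c(-1, 1) * qnorm(0.975) / sqrt(length(sp500w_stat))   # approximate 95% white-noise bands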
pacf(sp500w_stat)

### Backshift operator and difference operator (from TSA4) ####
?lag
acf_sp500w = round(acf(sp500w_stat, 10, plot = FALSE)$acf[-1], 3)  # first 10 sample ACF values (lag 0 dropped)
op <- par(mfrow = c(1, 2))
# scatterplots of the series against its own lagged values; the corresponding sample ACF value is shown in the legend
# (note: lag() uses opposite sign conventions for ts and xts objects)
plot(as.numeric(lag(sp500w_stat, -1)), as.numeric(sp500w_stat), xlab = "lag(sp500w, -1)", ylab = "sp500w")
legend('topleft', legend = acf_sp500w[1])
plot(as.numeric(lag(sp500w_stat, -6)), as.numeric(sp500w_stat), xlab = "lag(sp500w, -6)", ylab = "sp500w")
legend('topleft', legend = acf_sp500w[6])
par(op)  # restore the original plotting settings
?diff
diff(1:10, lag = 2, differences = 2)  # the difference operator: second differences at lag 2

### ARMA simulation ##############################
op <- par(mfrow = c(1, 2))  # split the plotting area into two boxes side by side
ts.sim <- arima.sim(list(order = c(1, 0, 0), ar = 0.7), n = 200)  # an AR(1); remember ARIMA(p,0,q) = ARMA(p,q)
acf(ts.sim)
pacf(ts.sim)
par(op)  # go back to the original settings

### Trend in time series #########################
trend_air <- lm(AirPassengers ~ time(AirPassengers))  # fit a linear model to extract the trend
summary(trend_air)  # print the coefficients and assess their significance
plot(AirPassengers)
abline(trend_air, col = 2)
time_air <- time(AirPassengers)
time_air_sq <- time(AirPassengers)^2
trend_air_quad <- lm(AirPassengers ~ time_air + time_air_sq)  # add a quadratic term
# you cannot use abline() here because the fitted trend is not a straight line
x_time <- seq(min(time(AirPassengers)), max(time(AirPassengers)), length.out = 1000)
y_trend_sq <- predict(trend_air_quad, list(time_air = x_time, time_air_sq = x_time^2))
lines(x_time, y_trend_sq, col = 3)
lines(lowess(AirPassengers, f = .05), lwd = 2, col = 4)  # lowess: nonparametric smoothing
lines(lowess(AirPassengers, f = .5), lwd = 2, col = 5)
# lowess has one smoothing parameter f: the smaller f is, the wigglier the curve, but the more closely it tracks the data
# this is an example of the bias-variance tradeoff

?arima
air_ar1 <- arima(AirPassengers, order = c(1, 0, 0))  # fit an ARIMA model, i.e. estimate the parameter values
air_ma1 <- arima(AirPassengers, order = c(0, 0, 1))
air_arma11 <- arima(AirPassengers, order = c(1, 0, 1))
AIC_air <- c(air_ar1$aic, air_ma1$aic, air_arma11$aic)  # model comparison: the smaller the AIC, the better the model
names(AIC_air) <- c("AR(1)", "MA(1)", "ARMA(1,1)")
AIC_air

# Useful transformations: differencing and taking logarithms
difflog_air = diff(log(AirPassengers))            # first difference of the log series
diff_difflog_air = diff(log(AirPassengers), 12)   # seasonal (lag-12) difference of the log series
plot.ts(cbind(AirPassengers, log(AirPassengers), difflog_air, diff_difflog_air), main = "")
monthplot(difflog_air)
monthplot(diff_difflog_air)

### Seasonality ##################################
set.seed(90210)
x = 2*cos(2*pi*1:500/50 + .6*pi) + rnorm(500, 0, 5)
z1 = cos(2*pi*1:500/50)
z2 = sin(2*pi*1:500/50)
summary(fit <- lm(x ~ 0 + z1 + z2))
plot.ts(x)
plot.ts(x, col = 8, ylab = expression(hat(x)))
lines(fitted(fit), col = 2)
periodogram(AirPassengers)  # frequency-domain analysis (from the TSA package)
plot(decompose(AirPassengers))
plot(stl(AirPassengers, "per"))
# stl() does the whole decomposition for you. Once trend and seasonality have been removed,
# you have to study the residual time series in terms of its ACF.
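# A minimal sketch of that residual check (the object names air_stl and air_remainder
# are just illustrative): extract the remainder component of the stl() decomposition
# and inspect its ACF.
air_stl <- stl(AirPassengers, s.window = "periodic")
air_remainder <- air_stl$time.series[, "remainder"]  # what is left after removing trend and seasonality
plot(air_remainder)
acf(air_remainder)  # ideally close to white noise; remaining significant lags suggest structure still to be modelled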