%data analysis

% Load the data set. count.dat is described as a 24-by-3 array: one row per
% hour over 24 hours, one column per intersection (hourly traffic counts).
% Loading the ASCII file creates the workspace variable "count".
load('count.dat')

% Missing data: inspect the third intersection only.
c3 = count(:, 3);            % series recorded at intersection 3
isMissing3 = isnan(c3);      % logical mask, true where a sample is NaN
c3NanCount = sum(isMissing3) % number of missing samples (displayed on purpose)

% Outliers at intersection 3: show the histogram together with the mean and
% the mean + 2*std level, then blank out samples lying above that level.
mu3 = mean(c3);              % sample mean
sigma3 = std(c3);            % sample standard deviation
upperLevel = mu3 + 2*sigma3; % x position of the two-sigma marker

% Height of the tallest histogram bar; used as the top of the marker lines.
% Calling hist with an output argument only returns the counts, it does not draw.
tallestBar = max(hist(c3));

hist(c3);                    % draw the histogram itself
hold on
plot([mu3 mu3], [0 tallestBar], 'r', 'LineWidth', 2); % vertical line at the mean
plot([upperLevel, upperLevel], [0 tallestBar], 'g');  % vertical line at mean + 2 std
hold off;

% Flag samples more than two standard deviations ABOVE the mean (one-sided
% test) and replace them with NaN so that they are ignored further on.
outlier = (c3 - mu3) > 2*sigma3;
c3(outlier) = NaN;

%smoothing and filtering
% Time series of the third intersection, drawn in its own figure window.
% Without a new figure this plot would replace the outlier histogram, because
% the previous section ends with "hold off".
figure
plot(c3,'o-')
hold on
% Note: c3 contains NaN where outliers were removed. NaN samples are not
% drawn, and every smoothed value whose window touches a NaN is NaN as well.

% Simple moving-average smoother
span = 3;                     % size of the averaging window
window = ones(span,1)/span;   % equal weights summing to one

% convn with the 'same' option returns the central part of the convolution,
% i.e. an output of the same size as c3 (centred moving average).
smoothed_c3 = convn(c3, window,'same');
plot(smoothed_c3,'ro-');

% filter applies the same weights causally: it returns the initial part of
% the convolution, so the result lags the centred average.
smoothed2_c3 = filter(window, 1, c3);
plot(smoothed2_c3,'mo-');

% Create the legend only after all three curves exist so each one is labelled.
legend('Data','Smoothed Data','Filtered Data');
hold off

%--------------------------------------------------------
% Summarizing data
% Each statistic below is evaluated on the whole "count" array; the results
% are intentionally left without a trailing semicolon so they are displayed.

% --- measures of location ---
x1 = mean(count)     % arithmetic mean
x2 = median(count)   % median
x3 = mode(count)     % most frequent value

% --- measures of scale ---
dx1 = max(count) - min(count)   % range (largest minus smallest value)
dx2 = std(count)                % standard deviation
dx3 = var(count)                % variance

% --- shape of the distribution ---
% Grouped histogram of the array in a new figure window, with one legend
% entry per intersection.
figure
hist(count)
legend({'Intersection 1', 'Intersection 2', 'Intersection 3'})

% Modelling the distribution shape: an exponential distribution is chosen,
% with its parameter mu set to the mean of the data.
c1 = count(:,1);     % data at intersection 1
mu1 = mean(c1)       % distribution parameter (displayed on purpose)

% Exponential density as an anonymous function, so no separate m-file is needed.
exp_pdf = @(t) (1/mu1)*exp(-t/mu1);

% Total area of the histogram bars (bin width times total count). The density
% integrates to one, so it is scaled by this area to be comparable with the bars.
[heights, centers] = hist(c1);
barArea = (centers(2) - centers(1)) * sum(heights);

% Histogram with the scaled density drawn on top, in a new figure window.
figure
hist(c1)
hold on
t = 0:150;           % evaluation grid for the fitted curve
plot(t, barArea * exp_pdf(t), 'r', 'LineWidth', 2);
legend('Distribution','Exponential fit')

%-----------------------------------------------------------------
% Visualizing data
% 2D scatter plot: intersection 1 against intersection 2.
c1 = count(:,1);     % data at intersection 1
c2 = count(:,2);     % data at intersection 2
pair12 = [c1 c2];    % the two series side by side, one column each

figure
scatter(c1, c2, 'filled')
xlabel('Intersection 1');
ylabel('Intersection 2');

% Covariance measures the strength of the linear relation between the two
% variables; the correlation coefficient is its standardized counterpart.
C12 = cov(pair12)         % 2-by-2 covariance matrix (displayed)
R12 = corrcoef(pair12);   % 2-by-2 correlation matrix
r12 = R12(1,2)            % off-diagonal entry: correlation of c1 and c2 (displayed)

%3D scatter plots: relationship between traffic volume at all 3
%intersections
% Re-extract intersection 3 from the raw data. The c3 left over from the
% outlier section has its outliers replaced by NaN, and scatter3 would
% silently leave those hours out of the plot.
c3 = count(:,3);
figure
scatter3(c1,c2,c3,'filled')
xlabel('Intersection 1');
ylabel('Intersection 2');
zlabel('Intersection 3');
%the eigenvalues of their covariance matrix tell the strength of the linear
%relationship among the variables in the 3D scatter

%scatter plot array
figure
plotmatrix(count) % compare the relationship between multiple pairs of intersections

%exploring data in graph, e.g.
% Use a separate figure window: without it, scatter draws into the current
% axes left behind by plotmatrix and overwrites the scatter plot array.
figure
scatter(count(:,1), count(:,3)) %select the Data Cursor tool and click on a data point

%----------------------------------------------------------------
% Modelling data
% Polynomial regression: polyfit estimates the coefficients and polyval
% evaluates the resulting model at arbitrary values.
% Example: sixth-degree polynomial through the intersection 3 series.
c3 = count(:,3);                     % raw data at intersection 3
tdata = (1:24)';                     % sample index 1..24 as a column vector
p_coeffs = polyfit(tdata, c3, 6);    % coefficients of the degree-6 fit

% Evaluate the model on a fine grid so the fitted curve looks smooth.
tfit = (1:0.01:24)';
yfit = polyval(p_coeffs, tfit);

% Data and fitted curve together in a new figure window.
figure
plot(c3, 'o-');
hold on;
plot(tfit, yfit, 'r-', 'LineWidth', 2);
legend('Data', 'Polynomial fit', 'Location', 'NW');