library(lmtest) library(sandwich) library(car) library(gplots) library(calibrate) library(msm) library(effects) op <- par(no.readonly=TRUE) #set the los for various things bp.test.significance<-0.05 CI.plot.significance<-0.05 #note for ramsey test: H0: model has no omitted variables #note for the BPW test: H0: model is homoskedastic # # Read in and edit the data # mydata <- read.csv("overdata.csv",header=TRUE) mydata$innings2<-mydata$Innings-1 mydata$first15<-ifelse(mydata$Over<=15, 1, 0) mydata$at.bat.lost.toss=abs(mydata$at.bat.won.toss-1) names(mydata) region.europe<-c("England","Ireland","Netherlands","Scotland","Wales") region.subcontinent<-c("Bangladesh","India","Pakistan","Sri Lanka") region.ssafrica<-c("Kenya","South Africa","Zimbabwe") region.windies<-c("West Indies") region.oceania<-c("Australia","New Zealand") region.other<-c("Canada","Malaysia","Morocco","United Arab Emirates") mydata$region[mydata$Country %in% region.europe]<-"Europe" mydata$region[mydata$Country %in% region.subcontinent]<-"Subcontinent" mydata$region[mydata$Country %in% region.ssafrica]<-"SS Africa" mydata$region[mydata$Country %in% region.windies]<-"West Indies" mydata$region[mydata$Country %in% region.oceania]<-"Oceania" mydata$region[mydata$Country %in% region.other]<-"Other" team.list<-unique(unlist(mydata$At.Bat)) team.list temp.common.support<-data.frame(teams=team.list,include=0) for(i in 1:nrow(temp.common.support)){ temp.subset1<-subset(mydata,mydata$At.Bat==temp.common.support$teams[i] & mydata$Innings==1 & mydata$Day.night==0) temp.subset2<-subset(mydata,mydata$At.Bat==temp.common.support$teams[i] & mydata$Innings==1 & mydata$Day.night==1) temp.subset3<-subset(mydata,mydata$At.Bat==temp.common.support$teams[i] & mydata$Innings==2 & mydata$Day.night==0) temp.subset4<-subset(mydata,mydata$At.Bat==temp.common.support$teams[i] & mydata$Innings==2 & mydata$Day.night==1) if(nrow(temp.subset1)>=1 & nrow(temp.subset2)>=1 & nrow(temp.subset3)>=1 & nrow(temp.subset4)>=1){ temp.common.support[i,2]<-1 } } cs<-subset(temp.common.support,temp.common.support$include==1) mydata$cs<-ifelse(mydata$At.Bat %in% cs$teams & mydata$Fielding %in% cs$teams,1,0) fulldata<-subset(mydata,mydata$Error.In.Data==0) mydata<-subset(mydata,mydata$Error.In.Data==0 & mydata$cs==1) #write.table(mydata,file="C:\\test.csv",sep=",") cs.team.list<-unique(unlist(mydata$At.Bat)) cs.team.list # # Add in the average ranking data and generate chart # quality <- read.csv("rankingdata.csv",header=TRUE) names(quality) mydata$ranking<-quality$Rank[match(mydata$At.Bat,quality$Country)] ranking<-quality$Rank dn.perc<-quality$Ppn.Day.Night*100 model<-lm(dn.perc~ranking) summary(model) se<-sqrt(diag(vcov(model))) se #Bubble Plot radius<-sqrt(quality$Total.Matches/pi) symbols(ranking,dn.perc,circles=radius,inches=0.25,fg="black",bg="#BDBDBD",xlab="Average annual ICC Points (May 1999 – December 2011)",ylab="Day Night Matches (%)",ylim=c(0,60),xlim=c(0,150)) grid() abline(model$coefficients[1],model$coefficients[2],col="red") textxy(ranking,dn.perc,quality$Country,cx=0.7) text(1.1,28,paste("Day Night (%) =",round(model$coefficients[1],1),"+",round(model$coefficients[2],1),"* Points"),col="Red",pos=4,cex=0.8) text(3.7,23,paste("(",round(se[1],1),") (",round(se[2],1),")",sep=""),col="Red",pos=4, cex=0.7) #dot plot plot(ranking,dn.perc,pch=16,xlab="Average annual ICC Points (May 1999 – December 2011)",ylab="Day Night Matches (%)",ylim=c(0,60),xlim=c(0,150)) grid() abline(model$coefficients[1],model$coefficients[2],col="red",lty=2) textxy(ranking,dn.perc,quality$Country,cx=0.7,) text(55,27,paste("Day Night (%) =",round(model$coefficients[1],1),"+",round(model$coefficients[2],1),"* Points"),col="Red",pos=4,cex=0.8) text(85,25,paste("(",round(se[1],1),") (",round(se[2],1),")",sep=""),col="Red",pos=4, cex=0.7) # # # Compute the table in the paper that shows teams preferences for batting second after winning the toss # # temp.mydata<-subset(mydata,mydata$new.game==1) temp.mydata$win.toss.field<-abs(temp.mydata$at.bat.won.toss-1) simplified.data<-data.frame(win.toss.field=temp.mydata$win.toss.field,Toss.Winner=temp.mydata$Toss.Winner,Day.night=temp.mydata$Day.night,Home.Team=temp.mydata$Home.Team,Away.Team=temp.mydata$Away.Team) coin.toss.decision<-data.frame(countries=cs.team.list,day.number=0,dn.number=0,day.win.toss=0,dn.win.toss=0,day.field=0,dn.field=0,day.ppn=0,dn.ppn=0,diff=0,pvalue=0,lower=0,upper=0,mid=0) for(i in 1:length(cs.team.list)) { # # Here you first take matches where the team won the toss, then split into day and dn # Then calculate ppn of time they chose to bat second # Then test for a statistical significant difference in the proportions # temp.team<-toString(cs.team.list[i]) temp.table.data.day<-subset(simplified.data,simplified.data$Toss.Winner==temp.team & simplified.data$Day.night==0) temp.table.data.dn<-subset(simplified.data,simplified.data$Toss.Winner==temp.team & simplified.data$Day.night==1) temp.day.average<-mean(temp.table.data.day$win.toss.field) temp.dn.average<-mean(temp.table.data.dn$win.toss.field) temp.diff<-temp.dn.average-temp.day.average test.result<-prop.test(x=c(sum(temp.table.data.dn$win.toss.field),sum(temp.table.data.day$win.toss.field)),n=c(length(temp.table.data.dn$win.toss.field),length(temp.table.data.day$win.toss.field))) coin.toss.decision[i,2]<-nrow(subset(simplified.data,(simplified.data$Home.Team==temp.team | simplified.data$Away.Team==temp.team) & simplified.data$Day.night==0)) coin.toss.decision[i,3]<-nrow(subset(simplified.data,(simplified.data$Home.Team==temp.team | simplified.data$Away.Team==temp.team) & simplified.data$Day.night==1)) coin.toss.decision[i,4]<-nrow(temp.table.data.day) coin.toss.decision[i,5]<-nrow(temp.table.data.dn) coin.toss.decision[i,6]<-sum(temp.table.data.day$win.toss.field) coin.toss.decision[i,7]<-sum(temp.table.data.dn$win.toss.field) coin.toss.decision[i,8]<-temp.day.average coin.toss.decision[i,9]<-temp.dn.average coin.toss.decision[i,10]<-temp.diff coin.toss.decision[i,11]<-test.result$p.value coin.toss.decision[i,12]<-test.result$conf.int[1] coin.toss.decision[i,13]<-test.result$conf.int[2] coin.toss.decision[i,14]<-mean(c(test.result$conf.int[1],test.result$conf.int[2])) } coin.toss.decision temp.uiw=coin.toss.decision$upper-coin.toss.decision$mid temp.liw=coin.toss.decision$mid-coin.toss.decision$lower plotCI(x=1:length(cs.team.list),y=coin.toss.decision$mid,uiw=temp.uiw,liw=temp.liw, lty = 2, ylim=c(min(coin.toss.decision$lower),max(coin.toss.decision$upper))) # #Subset to teams that lost the toss to eliminate preferences # backupdata<-mydata mydata<-subset(mydata,mydata$at.bat.won.toss==0) # #Graph the DID to motivate it # data.day.first<-subset(mydata$Runs,mydata$Innings==1 & mydata$Day.night==0) data.day.second<-subset(mydata$Runs,mydata$Innings==2 & mydata$Day.night==0) data.dn.first<-subset(mydata$Runs,mydata$Innings==1 & mydata$Day.night==1) data.dn.second<-subset(mydata$Runs,mydata$Innings==2 & mydata$Day.night==1) print("DID treatment effect") (mean(data.day.first)-mean(data.day.second))-(mean(data.dn.first)-mean(data.dn.second)) density.day.first<-density(data.day.first, bw=1) density.day.second<-density(data.day.second, bw=1) density.dn.first<-density(data.dn.first, bw=1) density.dn.second<-density(data.dn.second, bw=1) op <- par(no.readonly=TRUE) par(mfrow=c(3,2),mar=c(3.9,4.5,2,3)+.1) plot(density.day.first,xlim=c(0,36), main="", xlab="", ylab="density",cex.axis=1.25,cex.lab=1.25, bty="n") lines(density.day.second, lty=2, col=2) legend("topright", inset=0.02, legend=c("Innings 1","Innings 2"),lty=c(1,2),col=c("black","red"), bty="n") title(main="a. Distribution of runs, Day", font.main=4, family="A") plot(density.dn.first,xlim=c(0,36), main="", xlab="", ylab="density",cex.axis=1.25,cex.lab=1.25, bty="n") lines(density.dn.second, lty=2, col=2) legend("topright", inset=0.02, legend=c("Innings 1","Innings 2"),lty=c(1,2),col=c("black","red"), bty="n") title(main="b. Distribution of runs, Day-night", font.main=4, family="A") #estimate differences in density functions f.density.day.first<-approxfun(density.day.first$x,density.day.first$y,yleft=0,yright=0) f.density.day.second<-approxfun(density.day.second$x,density.day.second$y,yleft=0,yright=0) f.density.dn.first<-approxfun(density.dn.first$x,density.dn.first$y,yleft=0,yright=0) f.density.dn.second<-approxfun(density.dn.second$x,density.dn.second$y,yleft=0,yright=0) panel.a<-array(0,dim=c(3,37)) panel.b<-array(0,dim=c(3,37)) for(i in 1:37) { panel.a[1,i]<-integrate(f.density.day.first,i-1,i)$value panel.a[2,i]<-integrate(f.density.day.second,i-1,i)$value panel.a[3,i]<-panel.a[2,i]-panel.a[1,i] panel.b[1,i]<-integrate(f.density.dn.first,i-1,i)$value panel.b[2,i]<-integrate(f.density.dn.second,i-1,i)$value panel.b[3,i]<-panel.b[2,i]-panel.b[1,i] } DID.graph<-panel.b[3,]-panel.a[3,] ymax<-max(panel.a[3,],panel.b[3,],DID.graph) ymin<-min(panel.a[3,],panel.b[3,],DID.graph) windowsFonts( A=windowsFont("Georgia"), ) barplot(panel.a[3,],space=0.25, col="grey",ylim=c(ymin,ymax),border=NA, xlim=c(0,36), xlab="", ylab="Difference in probability",cex.axis=1.25,cex.lab=1.25) axis(1,lty=1,at=6*(0:6)+1,labels=c(0,6,12,18,24,30,36),cex.axis=1.25,cex.lab=1.25) title(main="c. 2nd innings - 1st innings, Day", font.main=4, family="A") barplot(panel.b[3,],space=0.25, col="grey",ylim=c(ymin,ymax),border=NA, xlim=c(0,36), xlab="", ylab="Difference in probability",cex.axis=1.25,cex.lab=1.25) axis(1,lty=1,at=6*(0:6)+1,labels=c(0,6,12,18,24,30,36),cex.axis=1.25,cex.lab=1.25) title(main="d. 2nd innings - 1st innings, Day-night", font.main=4, family="A") barplot(DID.graph,space=0.25, col="grey",ylim=c(ymin,ymax),border=NA, xlim=c(0,36), xlab="runs scored per over", ylab="Difference in probability",cex.axis=1.25,cex.lab=1.25) axis(1,lty=1,at=6*(0:6)+1,labels=c(0,6,12,18,24,30,36),cex.axis=1.25,cex.lab=1.25) title(main="e. Difference in differences", font.main=4, family="A") plot(sum(DID.graph*(0:36)),0, xlim=c(-1,1), ylim=c(0,0), xlab="runs scored per over",cex.axis=1.25,cex.lab=1.25,col="grey",yaxt="n",ylab="",bty="n",cex=3,pch=16) #axis(1,lty=1,at=6*(0:6)+1,labels=c(0,6,12,18,24,30,36),cex.axis=1.25,cex.lab=1.25) title(main="f. Weighted average", font.main=4, family="A") par(op) # # Basic DID estimates # print("Non-parametric treatment effect") sum(DID.graph*(0:36)) print("DID treatment effect") (mean(data.day.first)-mean(data.day.second))-(mean(data.dn.first)-mean(data.dn.second)) #Run DID models model1<-lm(Runs~Day.night+innings2+I(Day.night*innings2),data=mydata) summary(model1) reset(model1,power=3) bp.result1<-bptest(model1) bp.result1 if(bp.result1$p.value=temp.mydata$Target.Score-temp.mydata$Total.Runs) affected.matches affected.matches/total.matches*100 #total margin explained total.explaned<-sum(abs(temp.mydata$Over*estimated.effect))/sum(temp.mydata$Target.Score-temp.mydata$Total.Runs) total.explaned lower.total.explaned<-sum(abs(temp.mydata$Over*lower))/sum(temp.mydata$Target.Score-temp.mydata$Total.Runs) lower.total.explaned upper.total.explaned<-sum(abs(temp.mydata$Over*upper))/sum(temp.mydata$Target.Score-temp.mydata$Total.Runs) upper.total.explaned # # Sensitivity Tests # #model 7 includes ranking as a variable model7<-lm(Runs~Day.night+innings2+I(Day.night*innings2)+ranking+Over+at.bat.won.toss+region+at.bat.at.home+factor(Total.Out)+I(Over^2)+I(Over^3)+I(Over*Total.Out)+I((Over*Total.Out)^2)+I((Over*Total.Out)^3)+factor(At.Bat)+factor(Fielding),data=mydata) summary(model7) reset(model7,power=3) bptest(model7) bp.result7<-bptest(model7) bp.result7 if(bp.result7$p.value|z|)` = 2 * pnorm(abs(coef(modelW1)/std.err), lower.tail = FALSE), LL = coef(modelW1) - 1.96 * std.err, UL = coef(modelW1) + 1.96 * std.err) r.est # quasipoisson to control for some overdispersion modelW2 <- glm(Total.Out ~ Over+Day.night+innings2+I(Day.night*innings2)+region+at.bat.at.home+I(Over^2)+I(Over^3)+factor(At.Bat)+factor(Fielding),data=mydata, family="quasipoisson") summary(modelW2) cov.m1 <- vcovHC(modelW2, type = "HC0") std.err <- sqrt(diag(cov.m1)) r.est <- cbind(Estimate = coef(modelW2), `Robust SE` = std.err, `Pr(>|z|)` = 2 * pnorm(abs(coef(modelW2)/std.err), lower.tail = FALSE), LL = coef(modelW2) - 1.96 * std.err, UL = coef(modelW2) + 1.96 * std.err) r.est day.data<-mydata dn.data<-mydata day.data$Day.night<-0 day.data$innings2<-1 dn.data$Day.night<-1 dn.data$innings2<-1 day.fit<-exp(predict(modelW2,newdata=day.data)) dn.fit<-exp(predict(modelW2,newdata=dn.data)) input<-cbind(mydata$Over,day.fit,dn.fit) holder<-array(0,dim=c(50,4)) for(i in 1:50) { temp.input<-subset(input,input[,1]==i) holder[i,1]<-mean(temp.input[,1]) holder[i,2]<-mean(temp.input[,2]) holder[i,3]<-mean(temp.input[,3]) holder[i,4]<-holder[i,3]-holder[i,2] } holder plot(holder[,1],holder[,2],type="l",xlab="Over Number",ylab="Expected number of wickets lost") lines(holder[,1],holder[,3],col="red",lty=2) legend("topleft", inset=0.01, legend=c("Day only","Day-night"),lty=c(1,2),col=c("black","red"),bty="n") rm(list=ls(all=TRUE))