Welcome, Guest
Username Password: Remember me
  • Page:
  • 1

TOPIC: First R script

First R script 4 years, 2 months ago #407

#Based on Andreadis, I. (2014) Data Quality and Data Cleaning in
#Garzia, D. Marschall, S. (eds) Matching Voters with Parties and Candidates.
#Voting Advice Applications in Comparative Perspective, 
#ECPR Press, ISBN: 9781907301735 
#http://www.polres.gr/en/sites/default/files/VAA-Book-Ch6.pdf

#Download the files from http://doi.org/10.3886/E17816V3 and 
#save them to your working directory, e.g. C:/WSPTS before you start
#NOTE(review): setwd() inside a script is discouraged -- prefer running
#the script from the data directory or using a project-relative path.
setwd("C:/WSPTS")

#nofs is the number of statements (items) in the voting advice survey
nofs<-31
#SPSS data file with the responses and the per-item timestamps t0..t31
spssfile<-"hmv201410percent-17956.sav"
#CSV file with the character length of each statement
lengthfile<-"lengthws-18290.csv"

#We need the foreign package to read SPSS files
library(foreign)
#Store SPSS file in votematch as a data frame; some value-label
#warnings from read.spss are expected and can be ignored
votematch<-read.spss(spssfile, to.data.frame=TRUE, reencode="UTF-8")
dim(votematch)
names(votematch)

#keep track of the last column of the raw data; computed columns
#(d1..d31, cats1..cats31) will be appended after this position
lastc<-dim(votematch)[2]

#Let's review some of the timestamps
votematch$t2[1] #this is the timestamp of the first user answering the second question
format(votematch$t1[1], scientific=FALSE)
head(format(cbind(votematch$t1,votematch$t2),scientific=FALSE))

#timestamp is the number of milliseconds since midnight Jan 1, 1970 (Unix epoch)
#Convert step by step to check the plausibility of the recording date
ms<-votematch$t1[1]
s<-ms/1000 #seconds
m<-s/60    #minutes
h<-m/60    #hours
d<-h/24    #days
y<-d/365 #years since 1970 -- when was this response recorded?


#Keep a respondent only if every timestamp t0..t31 is positive;
#non-positive values indicate an unanswered or invalid item.
#Direct [[ ]] indexing replaces the fragile eval(parse()) construction,
#and the is.na() guard prevents NA comparisons from injecting
#all-NA rows during subsetting.
for (i in 0:nofs) {
  ti <- votematch[[paste0("t", i)]]
  votematch <- votematch[!is.na(ti) & ti > 0, ]
}

#Keep a respondent only if the timestamps are strictly increasing
#(t_i < t_{i+1}); equal or decreasing stamps signal corrupt data.
#Again: [[ ]] indexing instead of eval(parse()), with an NA guard
#so NA comparisons cannot create all-NA rows.
for (i in 0:(nofs - 1)) {
  ok <- votematch[[paste0("t", i)]] < votematch[[paste0("t", i + 1)]]
  votematch <- votematch[!is.na(ok) & ok, ]
}


dim(votematch)


#compute item response times, i.e. timestamp differences in seconds:
#d_{i+1} = (t_{i+1} - t_i) / 1000 (timestamps are in milliseconds).
#Direct [[ ]] assignment replaces the eval(parse()) construction.
for (i in 0:(nofs - 1)) {
  votematch[[paste0("d", i + 1)]] <-
    (votematch[[paste0("t", i + 1)]] - votematch[[paste0("t", i)]]) / 1000
}
summary(votematch$d1) #observe the maximum value!

#column positions of the newly appended d1..d31 columns
difcols<-c((lastc+1):(lastc+nofs))


#calculate the median response time for each item.
#vapply() is used instead of sapply() so the result type
#(a named numeric vector, one value per item) is guaranteed.
median_times <- vapply(votematch[, (lastc + 1):(lastc + nofs)],
                       median, numeric(1))
median_times

#load the file with the length of each statement (number of characters without spaces)
nofchars<-read.csv(lengthfile)
chars.time<-data.frame(nofchars, median_times)


plot(chars.time)
#put labels to identify the outliers
text(chars.time$Length, chars.time$median_times, 
     as.vector(row.names(chars.time)), pos=1,cex=0.8, col="red") 

#Linear regression model
fit<-lm(chars.time$median_times~chars.time$Length)
summary(fit)

#Calculate the thresholds for reading-speed classification.
#NOTE(review): 39.375 and 28.125 look like assumed reading speeds in
#characters per second, and 1.4 a fixed per-item overhead in seconds
#-- confirm against the book chapter cited at the top of the script.
threshold1<-1.4+chars.time$Length/39.375

threshold2<-1.4+chars.time$Length/28.125

threshold1
threshold2

#We need car to use recode
library(car)

#Classify each response time into reading-behaviour categories:
# 1 = scanning (faster than threshold1),
# 2 = skimming (between the two thresholds),
# 3 = rauding  (slower than threshold2).
#Uses nofs instead of a hard-coded 31, and [[ ]] indexing instead of
#eval(parse()). NOTE(review): as.factor.result is deprecated in recent
#versions of car (renamed as.factor) -- kept here for compatibility.
for (i in seq_len(nofs)) {
  spec <- paste0("lo:", threshold1[i], "=1;",
                 threshold1[i], ":", threshold2[i], "=2;",
                 threshold2[i], ":hi=3")
  votematch[[paste0("cats", i)]] <-
    recode(votematch[[paste0("d", i)]], spec, as.factor.result = FALSE)
}



#Frequency of scannning, skimming and rauding for one of the statements (e.g 22)
table(votematch$cats22)

catcols<-c((lastc+nofs+1):(lastc+2*nofs))

#get the categories (i.e. 1=scanning, 2=skimming, 3=rauding)
cats<-votematch[,catcols]
#is the respondent scanning? T/F
fast<-cats<2
#How many fast responses? 
sumfast<-rowSums(fast)

#You can be flexible on the rule you use to drop cases,
#e.g. you can use as in the chapter: "at least half are scanning or skimming"
#or you can have a more strict rule that will cut less cases: "at least half are scanning"
#or keep only if the respondent was scanning <11 times (less than one third)
length(sumfast[sumfast>10])/length(sumfast) #percentage of dropped cases
#keep respondents with at most 10 scanned items (out of 31)
cleandata<-votematch[sumfast<11,]
dim(cleandata)

# keep cases where other variables have valid values
# (drop rows whose factor level is the literal "Missing" label)
table(cleandata$sex)
cleandata<-cleandata[cleandata$sex!="Missing",]
table(cleandata$age)
cleandata<-cleandata[cleandata$age!="Missing",]
table(cleandata$edu)
cleandata<-cleandata[cleandata$edu!="Missing",]
table(cleandata$interest)
cleandata<-cleandata[cleandata$interest!="Missing",]
table(cleandata$device_type_n)
# keep only desktop-type devices: mobile and unknown devices have
# a different response-time profile and are dropped
cleandata<-cleandata[cleandata$device_type_n!="Mobile Device",]
cleandata<-cleandata[cleandata$device_type_n!="unknown",]
prop.table(table(cleandata$device_type_n))

# The distribution of response times per item is strongly skewed to the right;
# a log transform makes it roughly symmetric
summary(cleandata$d2)
hist(cleandata$d2)
boxplot(cleandata$d2)
hist(log(cleandata$d2))
boxplot(log(cleandata$d2))

# The distribution of response times per user?
#Let's observe the first user
i<-1
summary(as.numeric(cleandata[i,difcols]))
hist(as.numeric(cleandata[i,difcols]))
boxplot.stats(as.numeric(cleandata[i,difcols]))
#coef = 3 flags only extreme outliers (3 IQRs beyond the hinges)
boxplot.stats(as.numeric(cleandata[i,difcols]), coef=3)
boxplot(as.numeric(cleandata[i,difcols]))

#Let's observe the 60th user
i<-60
summary(as.numeric(cleandata[i,difcols]))
hist(as.numeric(cleandata[i,difcols]))
boxplot.stats(as.numeric(cleandata[i,difcols]))
boxplot.stats(as.numeric(cleandata[i,difcols]), coef=3)
boxplot(as.numeric(cleandata[i,difcols]))

#NOTE(review): purpose unclear -- presumably converting a response
#time of 1229 seconds to minutes (about 20); confirm with the author
1229/60

#Exploratory data analysis
#Flag, among the first 100 respondents, those having at least one
#extreme item response time (boxplot rule with coef = 3)
out1<-numeric(100)
for (i in 1:100) {
  user_times<-as.numeric(cleandata[i,difcols])
  n_extreme<-length(boxplot.stats(user_times, coef=3)$out)
  out1[i]<-as.numeric(n_extreme>0)
}
# this rule would cut too many cases
sum(out1)
which(out1==1)
#Let's observe the 10th user
i<-10
summary(as.numeric(cleandata[i,difcols]))
hist(as.numeric(cleandata[i,difcols]))
boxplot.stats(as.numeric(cleandata[i,difcols]))
boxplot.stats(as.numeric(cleandata[i,difcols]), coef=3)
boxplot(as.numeric(cleandata[i,difcols]))


summary(log(as.numeric(cleandata[i,difcols])))

hist(log(as.numeric(cleandata[i,difcols]))
boxplot.stats(log(as.numeric(cleandata[i,difcols])))
boxplot.stats(as.numeric(cleandata[i,difcols]), coef=3)
boxplot(as.numeric(cleandata[i,difcols]))

#Exploratory data analysis after applying the log function
#(quick illustration of how log compresses large values)
ex<-c(10,100,1000)
log(ex)
#on the log scale the rule cuts fewer cases; check the first 100
out1<-numeric(100)
for (i in 1:100) {
  log_times<-log(as.numeric(cleandata[i,difcols]))
  has_extreme<-length(boxplot.stats(log_times, coef=3)$out)>0
  if (has_extreme) out1[i]<-1
}
sum(out1)
which(out1==1)

dim(cleandata)[1]
# Define all extreme values as missing:
# for every respondent, any item whose log response time is an extreme
# outlier (boxplot rule, coef = 3) is set to NA.
# boxplot.stats() is computed once per respondent instead of twice,
# seq_len() replaces the unsafe 1:dim() pattern, and difcols[outcols]
# replaces the equivalent difcols[1]-1+colsi arithmetic.
for (i in seq_len(nrow(cleandata))) {
  log_times <- log(as.numeric(cleandata[i, difcols]))
  extreme <- boxplot.stats(log_times, coef = 3)$out
  if (length(extreme) > 0) {
    outcols <- which(log_times >= min(extreme))
    cleandata[i, difcols[outcols]] <- NA
  }
}
cleandata[1:11,difcols]



#Regression of the (cleaned) response time for item 2 on respondent
#characteristics; lm() drops rows where d2 was set to NA above
model<-lm(d2~sex+edu+age+device_type_n, data=cleandata)
summary(model)
Last Edit: 4 years, 2 months ago by Ioannis Andreadis.

Re: First R script 4 years, 2 months ago #408

  • Matthias
  • OFFLINE
  • Fresh Boarder
  • Posts: 1
  • Karma: 0
# Here's some R Code w.r.t the remarks
# on indexing and vectorization that I made after the last lab session
# also benchmarking shows avoiding loops in favor of indexing can
# speed up things quite a bit.

# also use the scipen option if you want to avoid exponential
# (scientific) notation of large numbers such as the timestamps
options(scipen = 4)

# foreign provides read.spss()
library(foreign)
votematch<-read.spss("ioannis.sav", to.data.frame=TRUE, reencode="UTF-8")
dim(votematch)
names(votematch)
str(votematch)
votematch$t1[1]


votematch2 <- votematch

# microbenchmark measures how long the vectorized cleaning takes
library(microbenchmark)

# Vectorized alternative to the row-filtering loops of the first post:
# mark all non-positive timestamps as NA at once, then drop incomplete
# rows with na.omit(). NOTE(review): this replicates only the t > 0
# filter, not the check that timestamps are strictly increasing.
# NOTE(review): grep("^t", ...) matches ANY column name starting with
# "t", not just t0..t31 -- verify no other columns match.
microbenchmark({relevant_colums <- grep("^t",names(votematch2),value=T)
relevant_df <- votematch2[,relevant_colums]
relevant_df[relevant_df <= 0] <- NA
# replace the relevant part of votematch2 with the NA-marked copy
votematch2[,relevant_colums] <- relevant_df
# run na.omit over the entire data.frame to drop rows with any NA
relevant_df_clean <- na.omit(votematch2)
})
  • Page:
  • 1
Time to create page: 0.11 seconds