Debates in the UK Parliament are transcribed and published online as Hansard, but it is not easy to scrape all the texts from the website. A much better source of parliamentary debate data is ParlParse, a website of TheyWorkForYou. On the website, Hansard reports are provided as XML files. Yet, we still have to write a script to import the data into R.
require(XML)
require(stringi)
# Read a text file and return its whole content as a single string.
#
# fileName: path of the file to read.
# Returns a length-one character vector in which the file's lines are
# joined by newline characters ("" for an empty file).
readFile <- function(fileName) {
  paste(readLines(fileName, encoding = "UTF-8"), collapse = "\n")
}
# Read every debate XML file below a directory into one data frame.
#
# dir: directory that is searched recursively for files ending in ".xml".
# Returns the row-bound result of readDebateXML() over all matching files,
# or an empty data.frame() when no file matches.
readDebateDir <- function(dir) {
  files <- list.files(dir, full.names = TRUE, recursive = TRUE, pattern = "\\.xml$")
  if (length(files) == 0L) {
    return(data.frame())
  }
  # Parse each file first and bind once at the end; calling rbind() inside
  # the loop copies the accumulated data frame on every iteration.
  frames <- lapply(files, readDebateXML)
  do.call(rbind, frames)
}
# Parse one debate XML file into a data frame with one row per <speech> node.
#
# file: path of the XML file; it is echoed to the console before parsing.
# Returns a data frame with character columns date, time, speaker, personId,
# text and file (the input path), or an empty data.frame() when the document
# contains no <speech> node.
readDebateXML <- function(file) {
  cat('Reading', file, '\n')
  xml <- xmlParse(readFile(file))
  speeches <- getNodeSet(xml, '//speech')
  if (length(speeches) == 0L) {
    return(data.frame())
  }
  # Extract all speeches first, then build the data frame in one step instead
  # of rbind()-ing a one-row data frame per speech (quadratic copying).
  values <- lapply(speeches, getSpeech)
  # getSpeech() returns an unnamed list: date, time, speaker, personId, text.
  column <- function(i) {
    vapply(values, function(v) as.character(v[[i]]), character(1))
  }
  data.frame(date = column(1),
             time = column(2),
             speaker = column(3),
             personId = column(4),
             text = column(5),
             file = file, stringsAsFactors = FALSE)
}
# Collect the fields of a single <speech> node.
#
# speech: an XML node whose attributes and <p> children are read.
# Returns an unnamed list in the order date, time, speaker, personId, text.
# Any field whose source attribute ("id", "time", "speakername", "person_id")
# is absent is returned as "".
getSpeech <- function(speech) {
  attribs <- xmlAttrs(speech)
  present <- names(attribs)

  speaker <- if ("speakername" %in% present) attribs[["speakername"]] else ""
  personId <- if ("person_id" %in% present) getPersonId(attribs[["person_id"]]) else ""
  # The date is taken from the "id" attribute via getDate().
  date <- if ("id" %in% present) getDate(attribs[["id"]]) else ""
  time <- if ("time" %in% present) getTime(attribs[["time"]]) else ""

  list(date, time, speaker, personId, getSpeechText(speech))
}
# Extract the text of a speech node.
#
# x: an XML node; the values of its direct <p> children are joined with
# newlines and surrounding whitespace is trimmed from the joined string.
# Returns a single string ("" when there is no <p> child).
getSpeechText <- function(x) {
  paragraphs <- unlist(xpathApply(x, "./p", xmlValue))
  stri_trim(paste(paragraphs, collapse = "\n"))
}
# Normalise a colon-separated time string.
#
# x: a string such as "hh:mm:ss".
# The first three colon-separated fields are converted to numbers and joined
# again with ":", so leading zeros are dropped and a missing field shows up
# as "NA".
getTime <- function(x) {
  fields <- unlist(stri_split_fixed(x, ":"))
  # Convert the three fields one by one, as separate as.numeric() calls.
  hms <- lapply(fields[1:3], as.numeric)
  do.call(paste, c(hms, sep = ":"))
}
# Extract the date part of a slash-separated identifier.
#
# x: a string split on "/"; the first 10 characters of its third field are
# returned (NA when there is no third field).
getDate <- function(x) {
  third <- unlist(stri_split_fixed(x, "/"))[3]
  stri_sub(third, from = 1, to = 10)
}
# Extract the person identifier from a slash-separated string.
#
# x: a string split on "/"; its third field is returned (NA when there is
# no third field).
getPersonId <- function(x) {
  unlist(stri_split_fixed(x, "/"))[3]
}
# Import files (available at https://www.theyworkforyou.com/pwdata/scrapedxml/debates/)
# Read single files: each call prints the file name and returns one row per
# <speech> node. Note that `dat` is reassigned by the second call, so it ends
# up holding only the speeches of the second file.
dat <- readDebateXML("./parliament/data/debates1919-02-04a.xml")
dat <- readDebateXML("./parliament/data/debates2020-09-23b.xml")
# Read every .xml file found (recursively) under the data directory into a
# single data frame.
dat_all <- readDebateDir("./parliament/data")
UPDATE on 08/10/2020: The R code was updated to use the stringi package.