Import UK parliamentary debate data into R

Debates in the UK Parliament are transcribed and published online as Hansard, but it is not easy to scrape all the texts from the website. A much better source of parliamentary debate data is ParlParse, a website of TheyWorkForYou, where the Hansard reports are provided as XML files. Even so, we still have to write a script to import the data into R.

require(XML)
require(stringi)

#' Read a whole text file into a single string
#'
#' @param fileName Path of the file to read.
#' @return A length-one character vector containing every line of the file,
#'   joined with newline characters (an empty file yields "").
readFile <- function(fileName) {
    # The lines are read with their encoding declared as UTF-8.
    content <- readLines(fileName, encoding = "UTF-8")
    paste0(content, collapse = "\n")
}

#' Read every debate XML file below a directory
#'
#' Searches `dir` recursively for files whose names end in ".xml", parses each
#' one with readDebateXML() and stacks the results.
#'
#' @param dir Path of the directory to search.
#' @return A data frame with the rows of all parsed files in the order
#'   returned by list.files(); an empty data frame when no file matches.
readDebateDir <- function(dir) {
    files <- list.files(dir, full.names = TRUE, recursive = TRUE, pattern = "\\.xml$")
    if (length(files) == 0) {
        # do.call(rbind, list()) would give NULL, so keep the empty data frame.
        return(data.frame())
    }
    # Parse every file first and bind once: calling rbind() inside a loop
    # re-copies all accumulated rows on each iteration.
    parsed <- lapply(files, readDebateXML)
    do.call(rbind, parsed)
}

#' Parse one debate XML file into a data frame of speeches
#'
#' Prints a progress line, parses the file and extracts one row per
#' <speech> element found anywhere in the document.
#'
#' @param file Path of the XML file to read.
#' @return A data frame with character columns `date`, `time`, `speaker`,
#'   `personId`, `text` and `file` (the path passed in), one row per speech;
#'   an empty data frame when the document contains no <speech> element.
readDebateXML <- function(file) {
    cat('Reading', file, '\n')
    # readFile() returns the document content, not a path, so tell the parser
    # explicitly instead of letting it guess from the look of the string.
    xml <- xmlParse(readFile(file), asText = TRUE)
    speeches <- getNodeSet(xml, '//speech')
    if (length(speeches) == 0) {
        return(data.frame())
    }
    # Build all rows first and bind once: calling rbind() inside the loop
    # re-copies every accumulated row for each speech.
    rows <- lapply(speeches, function(speech) {
        values <- getSpeech(speech)
        data.frame(date = values[[1]],
                   time = values[[2]],
                   speaker = values[[3]],
                   personId = values[[4]],
                   text = values[[5]],
                   file = file, stringsAsFactors = FALSE)
    })
    do.call(rbind, rows)
}

#' Extract the fields of a single <speech> node
#'
#' Each attribute-derived field falls back to "" when the attribute is absent.
#'
#' @param speech An XML node for one speech.
#' @return An unnamed list holding, in order: date (from the `id` attribute),
#'   time (from `time`), speaker (from `speakername`), person id (from
#'   `person_id`) and the speech text.
getSpeech <- function(speech) {
    attribs <- xmlAttrs(speech)
    present <- names(attribs)

    speaker <- if ("speakername" %in% present) attribs[['speakername']] else ""
    personId <- if ("person_id" %in% present) getPersonId(attribs[['person_id']]) else ""
    date <- if ("id" %in% present) getDate(attribs[['id']]) else ""
    time <- if ("time" %in% present) getTime(attribs[['time']]) else ""

    list(date, time, speaker, personId, getSpeechText(speech))
}

#' Collect the text of a speech
#'
#' @param x An XML node for one speech.
#' @return A single string: the values of the node's `p` children joined
#'   with newlines and trimmed of surrounding whitespace ("" when there
#'   are none).
getSpeechText <- function(x) {
    paragraphs <- xpathApply(x, './p', xmlValue)
    joined <- paste(unlist(paragraphs), collapse = "\n")
    stri_trim(joined)
}

#' Normalise a colon-separated time string
#'
#' The first three colon-separated fields are converted to numbers and
#' pasted back together, so leading zeros are dropped ("09:05:00" becomes
#' "9:5:0") and a missing field appears as "NA".
#'
#' @param x A time string such as "hh:mm:ss".
#' @return A string of the form "h:m:s".
getTime <- function(x) {
    fields <- unlist(stri_split_fixed(x, ':'))
    hours <- as.numeric(fields[1])
    minutes <- as.numeric(fields[2])
    seconds <- as.numeric(fields[3])
    paste(hours, minutes, seconds, sep = ':')
}

#' Extract the date from a speech id
#'
#' @param x A slash-separated id string.
#' @return The first ten characters of the third slash-separated segment
#'   of `x`.
getDate <- function(x) {
    segments <- unlist(stri_split_fixed(x, '/'))
    stri_sub(segments[3], 1, 10)
}

#' Extract the person identifier from a person_id attribute
#'
#' @param x A slash-separated person id string.
#' @return The third slash-separated segment of `x` (NA when there are
#'   fewer than three segments).
getPersonId <- function(x) {
    segments <- unlist(stri_split_fixed(x, '/'))
    segments[3]
}

# Import files (available at https://www.theyworkforyou.com/pwdata/scrapedxml/debates/)
# Read single files: each call returns one data frame of speeches.
# Note that the second assignment overwrites the first result held in `dat`.
dat <- readDebateXML("./parliament/data/debates1919-02-04a.xml")
dat <- readDebateXML("./parliament/data/debates2020-09-23b.xml")
# Read every .xml file found recursively under the data directory into one data frame.
dat_all <- readDebateDir("./parliament/data")

UPDATE on 08/10/2020: The R code was updated to use the stringi package.

Posts created 113

Leave a Reply

Your email address will not be published. Required fields are marked *

Related Posts

Begin typing your search term above and press enter to search. Press ESC to cancel.

Back To Top