Import UK parliamentary debate data into R

Standard

Debates in the UK Parliament are transcribed and published online as Hansard, but it is not easy to scrape all the texts from the website. A much better source of parliamentary debate data is ParlParse, a website of TheyWorkForYou. On that website, the Hansard reports are provided as XML files. Even so, we still have to write a script to import the data into R.

library(XML)

# Import every debate XML file found below `dir` into a single data frame.
#
# Args:
#   dir: directory that is searched recursively for debate XML files.
#        Defaults to the location used by the original script, so existing
#        calls without arguments keep working.
#
# Returns:
#   Whatever readDebateDir() returns for `dir` (one row per speech).
importDebates <- function(dir = '/home/kohei/Documents/UK immigration dictionary/UK Parlimentary debates/scrapedxml/debates'){
  # A missing directory would otherwise just produce an empty result with
  # no hint about what went wrong.
  if(!dir.exists(dir)){
    warning('Debate directory does not exist: ', dir, call. = FALSE)
  }
  readDebateDir(dir)
}

# Read an ISO-8859-1 encoded XML file and return it as one UTF-8 string.
#
# The bytes are read without any re-encoding by the connection and are then
# converted exactly once with iconv(). (Letting the connection re-encode and
# calling iconv() afterwards converts the text twice, which corrupts every
# non-ASCII character in a UTF-8 locale.)
#
# Args:
#   fileName: path of the file to read.
#
# Returns:
#   A length-one character vector holding the file content in UTF-8, with
#   lines joined by '\n' and the XML encoding declaration rewritten from
#   ISO-8859-1 to UTF-8 so that it matches the converted content.
readFile <- function(fileName){
  con <- file(fileName, open = 'r', encoding = 'native.enc')
  # Release the connection even when reading fails.
  on.exit(close(con), add = TRUE)
  lines <- readLines(con)
  lines2 <- iconv(lines, 'ISO-8859-1', 'UTF-8')
  text <- paste(lines2, collapse = '\n')
  gsub('encoding="ISO-8859-1"', 'encoding="UTF-8"', text, fixed = TRUE)
}
# Read all debate XML files below a directory into one data frame.
#
# Args:
#   dir: directory that is searched recursively; only files whose name ends
#        in '.xml' (case-insensitive) are read.
#
# Returns:
#   A data frame with the columns date, time, speaker, speakerId and text,
#   holding the rows of every file read with readDebateXML(). When nothing
#   is found the result has the same five columns and zero rows.
readDebateDir <- function(dir){
  files <- list.files(dir, pattern = '\\.xml$', full.names = TRUE,
                      recursive = TRUE, ignore.case = TRUE)
  frames <- lapply(files, readDebateXML)
  # Files without any speech contribute nothing.
  frames <- Filter(function(frame) NROW(frame) > 0, frames)
  if(length(frames) == 0){
    return(data.frame(date = character(0), time = character(0),
                      speaker = character(0), speakerId = character(0),
                      text = character(0), stringsAsFactors = FALSE))
  }
  # Bind once at the end; rbind() inside the loop copies all previous rows
  # on every iteration.
  do.call(rbind, frames)
}

# Parse one debate XML file into a data frame with one row per <speech>.
#
# Args:
#   file: path of the XML file.
#
# Returns:
#   A data frame with the character columns date, time, speaker, speakerId
#   and text. A file without <speech> elements gives zero rows but still
#   has all five columns.
#
# Side effects:
#   Prints a 'Reading <file>' progress line.
readDebateXML <- function(file){
  cat('Reading', file, '\n')

  # readFile() returns the document content, not a path, so tell the parser
  # explicitly instead of letting it guess.
  xml <- xmlParse(readFile(file), asText = TRUE)
  speeches <- getNodeSet(xml, '//speech')

  # Each record is list(date, time, speaker, speakerId, text).
  records <- lapply(speeches, getSpeech)
  column <- function(i){
    vapply(records, function(record) as.character(record[[i]]),
           character(1), USE.NAMES = FALSE)
  }

  # stringsAsFactors = FALSE keeps the text columns as character vectors on
  # R versions before 4.0 as well.
  data.frame(date = column(1), time = column(2), speaker = column(3),
             speakerId = column(4), text = column(5),
             stringsAsFactors = FALSE)
}

# Extract the fields of one <speech> node.
#
# Args:
#   speech: an XML node representing a speech.
#
# Returns:
#   An unnamed list of five elements in the order date, time, speaker,
#   speakerId, text. A field whose source attribute ('id', 'time',
#   'speakername', 'speakerid') is absent is the empty string ''.
getSpeech <- function(speech){
  attribs <- xmlAttrs(speech)
  available <- names(attribs)

  # Run `parser` on attribute `key` when it exists, otherwise fall back to ''.
  fromAttr <- function(key, parser){
    if(key %in% available){
      parser(attribs[[key]])
    }else{
      ''
    }
  }

  speakerName <- fromAttr('speakername', getSpeaker)
  speakerRef <- fromAttr('speakerid', getSpeakerId)
  speechDate <- fromAttr('id', getDate)
  speechTime <- fromAttr('time', getTime)
  body <- getSpeechText(speech)

  list(speechDate, speechTime, speakerName, speakerRef, body)
}

# Return the speaker name as a UTF-8 string.
#
# The name is not transcoded from Latin-1 here: treating text that is
# already UTF-8 as ISO-8859-1 turns every non-ASCII letter into two garbage
# characters. enc2utf8() honours the encoding the string is marked with and
# leaves UTF-8 and ASCII input untouched.
#
# Args:
#   speaker: the speaker name (character).
#
# Returns:
#   The same name encoded as UTF-8.
getSpeaker <- function(speaker){
  enc2utf8(as.character(speaker))
}

# Build the text of a speech from its paragraphs.
#
# Args:
#   speech: an XML node representing a speech.
#
# Returns:
#   A single string: the values of the node's direct <p> children, passed
#   through removeSpechialChar(), joined with ' | ' and with whitespace at
#   both ends removed.
getSpeechText <- function(speech){
  paragraphs <- unlist(xpathApply(speech, './p', 'xmlValue'))
  cleaned <- unlist(removeSpechialChar(paragraphs))
  joined <- paste(cleaned, collapse=' | ')
  # Strip leading and trailing whitespace only
  gsub("^\\s+|\\s+$", "", joined)
}

# Replace entity references that are left in the text with a single space.
#
# Besides all-lowercase names such as '&nbsp;' this also covers names with
# capitals or digits ('&Eacute;', '&frac12;') and numeric references in
# decimal ('&#163;') or hexadecimal ('&#xA3;') form.
#
# Args:
#   text: character vector to clean.
#
# Returns:
#   A character vector of the same length with every entity reference
#   replaced by ' '.
removeSpechialChar <- function(text){
  entity <- '&(#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);'
  gsub(entity, ' ', text)
}

# Normalise a colon-separated time string.
#
# The first three ':'-separated parts are converted to numbers and joined
# with ':' again, which drops leading zeros ('09:05:00' becomes '9:5:0').
# A part that is missing shows up as 'NA' in the result.
#
# Args:
#   time: time string such as '14:30:00'.
#
# Returns:
#   The normalised time as a string.
getTime <- function(time){
  pieces <- unlist(strsplit(time, ':', fixed = TRUE))
  paste(as.numeric(pieces[1]),
        as.numeric(pieces[2]),
        as.numeric(pieces[3]),
        sep = ':')
}

# Extract the date from a speech id.
#
# Takes the third '/'-separated part of `id` and keeps its first ten
# characters.
# NOTE(review): presumably ids look like
# 'uk.org.publicwhip/debate/2010-05-25a.3.0', so this yields 'YYYY-MM-DD' -
# confirm against the data.
#
# Args:
#   id: value of the speech's 'id' attribute.
#
# Returns:
#   The ten-character prefix of the third part, or NA when `id` has fewer
#   than three parts.
getDate <- function(id){
  segments <- unlist(strsplit(id, '/', fixed = TRUE))
  substring(segments[3], 1, 10)
}

# Extract the speaker identifier from a 'speakerid' attribute value.
#
# Args:
#   id: attribute value made of '/'-separated parts.
#
# Returns:
#   The third '/'-separated part of `id`, or NA when there are fewer than
#   three parts.
getSpeakerId <- function(id){
  segments <- strsplit(id, '/', fixed = TRUE)
  unlist(segments)[3]
}