Import UK parliamentary debate data into R


Debates in the UK Parliament are transcribed and published online as Hansard, but it is not easy to scrape all the texts from the website. A much better source of parliamentary debate data is ParlParse, a website run by TheyWorkForYou, where the Hansard reports are provided as XML files. Even then, we still have to write a script to import the data into R.

library(XML)

importDebates <- function(){
  
  dir <- '/home/kohei/Documents/UK immigration dictionary/UK Parlimentary debates/scrapedxml/debates'
  df <- readDebateDir(dir)
  return(df)
}

# Read an XML file as ISO-8859-1 and return its text re-encoded as UTF-8
readFile <- function(fileName){
  file <- file(fileName, encoding = "ISO-8859-1")
  lines <- readLines(file)
  close(file)
  lines2 <- iconv(lines, "ISO-8859-1", "UTF-8")
  text <- paste(lines2, collapse = '\n')
  text2 <- gsub('encoding="ISO-8859-1"', 'encoding="UTF-8"', text, fixed = TRUE)
  return(text2)
}
readDebateDir <- function(dir){
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame(date = c(), time = c(), speaker = c(), speakerId = c(), text = c())
  
  for(file in files){
    if(grepl('\\.xml$', file, ignore.case = TRUE)){
      df <- rbind(df, readDebateXML(file))
    }
  }
  return(df)
}

readDebateXML <- function(file){
  cat('Reading', file, '\n')
  #xml <- xmlParse(file, encoding = 'ISO-8859-1')

  xml <- xmlParse(readFile(file))
  #speeches <- xpathApply(xml, '//speech', 'xmlAttrs')
  dates <- c()
  times <- c()
  speakers <- c()
  speakerIds <- c()
  texts <- c()
  speeches <- getNodeSet(xml, '//speech')
  for(speech in speeches){
    values <- getSpeech(speech)
    dates <- append(dates, values[[1]])
    times <- append(times, values[[2]])
    speakers <- append(speakers, values[[3]])
    speakerIds <- append(speakerIds, values[[4]])
    texts <- append(texts, values[[5]])
  }

  df <- data.frame(date = dates, time = times, speaker = speakers, speakerId = speakerIds, text = texts)
  return(df)
}

getSpeech <- function(speech){
  #str(speech[['speakername']])
  attribs <- xmlAttrs(speech)
  #print(xmlAttrs(speech, "speakername"))
  if("speakername" %in% names(attribs)){
    speaker = getSpeaker(attribs[['speakername']])
  }else{
    speaker = ''
  }
  if("speakerid" %in% names(attribs)){
    speakerId = getSpeakerId(attribs[['speakerid']])
  }else{
    speakerId = ''
  }
  if("id" %in% names(attribs)){
    date = getDate(attribs[['id']])
  }else{
    date = ''
  }
  if("time" %in% names(attribs)){
    time = getTime(attribs[['time']])
  }else{
    time = ''
  }
  text <- getSpeechText(speech)
  return(list(date, time, speaker, speakerId, text))
  
}

getSpeaker <- function(speaker){
  speaker2 <- iconv(speaker, "ISO-8859-1", "UTF-8")
  return(speaker2)
}

getSpeechText <- function(speech){
  ps <- unlist(xpathApply(speech, './p', 'xmlValue'))
  ps2 <- removeSpecialChar(ps)
  text <- paste(unlist(ps2), collapse=' | ')
  text2 <- gsub("^\\s+|\\s+$", "", text)
  return(text2)
}

removeSpecialChar <- function(text){
  text2 <- gsub('&[a-z]+;', ' ' , text) # replace named character entities such as &amp; with a space
  return(text2)
}

getTime <- function(time){
  
  parts <- unlist(strsplit(time, ':', fixed = TRUE))
  h <- as.numeric(parts[1])
  m <- as.numeric(parts[2])
  s <- as.numeric(parts[3])
  time2 <- paste(h, m, s, sep = ':')
  return(time2)
}

getDate <- function(id){
  parts <- unlist(strsplit(id, '/', fixed = TRUE))
  date <- substr(parts[3], 1, 10)
  return(date)
}

getSpeakerId <- function(id){
  parts <- unlist(strsplit(id, '/', fixed = TRUE))
  speakerId <- parts[3]
  return(speakerId)
}
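
Once these functions are loaded, importing a whole folder of debate XML files is a single call. Below is a minimal usage sketch; the directory path is a placeholder, and the example values in the comments assume the usual ParlParse id format (uk.org.publicwhip/debate/… and uk.org.publicwhip/member/…).

# Minimal usage sketch: point readDebateDir() at a local folder of ParlParse
# debate XML files (the path below is a placeholder).
debates <- readDebateDir('~/scrapedxml/debates')
str(debates)
head(sort(table(debates$speaker), decreasing = TRUE), 10) # most active speakers

# The helpers assume ParlParse-style attribute values, e.g.
# getDate('uk.org.publicwhip/debate/2014-01-15a.123.4') # "2014-01-15"
# getSpeakerId('uk.org.publicwhip/member/40252')        # "40252"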

News data importer for R


This April, I created an R script to import files downloaded from Nexis and Factiva. Factiva does not offer a file download function, but its search result pages can be saved as HTML files and imported into R with this script.

library(XML) #might need libxml2-dev via apt-get command

readNewsDir <- function(dir, type){
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame(head = c(), body = c(), pub = c(), datetime = c(), edition = c(), length = c(), stringsAsFactors = FALSE)
  for(file in files){
    #print(file)
    if(grepl('\\.html$|\\.htm$|\\.xhtml$', file, ignore.case = TRUE)){
      #print(file)
      if(type == 'nexis'){
        df <- rbind(df, readNexisHTML(file))
      }else if(type == 'factiva'){
        df <- rbind(df, readFactivaHTML(file))
      }
    }
  }
  return(df)
}

#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(file, sep = ' | '){
    
  heads = c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  #Convert format
  cat('Reading', file, '\n')
  f <- file(file, encoding = "UTF-8")
  lines <- readLines(f)
  close(f)
  docnum <- 0
  for(i in 1:length(lines)){
    lines[i] <- gsub('<!-- Hide XML section from browser', '', lines[i])
    if(grepl('<DOC NUMBER=1>', lines[i])) docnum <- docnum + 1
    lines[i] <- gsub('<DOC NUMBER=1>', paste0('<DOC ID="doc_id_', docnum, '">'), lines[i])
    lines[i] <- gsub('<DOCFULL> -->', '<DOCFULL>', lines[i])
    lines[i] <- gsub('</DOC> -->', '</DOC>', lines[i])
  }
  html <- paste(lines, collapse='\n')

  #Write to debug
  cat(html, file="/home/kohei/Documents/R/converted.html", sep="", append=FALSE)
  
  #Extract elements
  doc <- htmlParse(html , encoding="UTF-8")
  ids <- paste0('doc_id_', 1:500)
  for(id in ids){
    query <- paste0('/html/body//doc[@id="', id , '"]')
    nodes <- getNodeSet(doc, query)
    if(length(nodes)){
      node <- nodes[[1]]
    }else{
      next #can not break since there are errors in HTML
    }

    pub <- ''
    #pubns <- getNodeSet(node, './/div[@class="c10"]/p[@class="c11"]/span[@class="c12"]')
    pubns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c1"]/span[@class="c2"]')
    if(length(pubns)){
      i <- 1
      for(pubn in pubns){
        if(grepl('DOCUMENTS$', xmlValue(pubn))){
          docnum <- i
        }
        if(i == docnum + 1){
          pub <- xmlValue(pubn)
          pub <- cleanNews(pub)
        }
        i <- i + 1
      }
    }
    if(nchar(pub) == 0) pub <- NA
    #print(pub)
    
    date <- ''
    dateline <- ''
    #datelinens <- getNodeSet(node, './/div[@class="c13" or @class="c14"]/p[@class="c11"]')
    datelinens <- getNodeSet(node, './/div[@class="c3" or @class="c4"]/p[@class="c1"]')
    if(length(datelinens)) dateline <- xmlValue(datelinens[[1]])
    #print(datelinens)
    dates <- strsplit(sub(',', '', dateline, fixed = TRUE), ' ', fixed = TRUE)[[1]]
    date <- paste(dates[1], dates[2], dates[3], sep=' ')
    if(nchar(date) == 0) date <- NA
    
    edition <- ''
    if(length(dates) >= 5){
      edition <- paste(dates[5:length(dates)], collapse = ' ')
      edition <- cleanNews(edition)
    }
    time <- ''
    timezone <- ''
    if(grepl('^[0-9]{1,2}:[0-9]{1,2} (AM|PM)', edition)){
      timeline <- edition
      timeline.parts <- strsplit(timeline, ' ')[[1]]
      #print(edition)
      #print(timeline.parts)
      time <- paste(timeline.parts[1], timeline.parts[2])
      if(length(timeline.parts) > 2){
        timezone <- paste(timeline.parts[3:length(timeline.parts)], collapse = ' ')
      }
      edition <- ''
      #print(time)
    }
    if(nchar(time) == 0) time <- '12:00 AM'
    if(nchar(edition) == 0) edition <- ''
    if(nchar(timezone) == 0) timezone <- ''
        
    body <- ''
    #bodyns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c17" or @class="c18"]/span[@class="c12" or @class="c14"]')
    #bodyns <- getNodeSet(node, './/div[@class="c5"]/p[@class="c15" or @class="c9" or @class="c9"]/span[@class="c2" or @class="c3"]')
    bodyns1 <- getNodeSet(node, './/div[(@class="c5") and count(.//p) > 1]//p')
    bodyns2 <- getNodeSet(node, './/div[(@class="c4") and count(.//p) > 1]//p')
    if(length(bodyns1) > length(bodyns2)){
      bodyns <- bodyns1
    }else{
      bodyns <- bodyns2
    }
    #if(is.null(bodyns)) print(node)
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse=sep)
      if(nchar(body) == 0) body <- NA
    } 
    
    by <- ''
    code <- ''
    head <- ''
    length <- 0
    #attribns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c5" and .//@class="c12"]')
    attribns1 <- getNodeSet(node, './/div[@class="c5"]/p[@class="c6"]')
    attribns2 <- getNodeSet(node, './/div[@class="c4"]/p[@class="c5"]')
    if(length(attribns1) > length(attribns2)){
      attribns <- attribns1
    }else{
      attribns <- attribns2
    }
    if(length(attribns)){
      for(attribn in attribns){
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl('^BYLINE: ', attrib)){
          by <- gsub('BYLINE: ', '', attrib)
        }
        if(grepl('^LENGTH: ', attrib)){
          length <- as.numeric(gsub('[^0-9]', '', attrib))
        }
        if(grepl('^JOURNAL-CODE: ', attrib)){
          code <- gsub('JOURNAL-CODE: ', '', attrib)
        }
        if(!grepl('^[A-Z\\-]{6,20}: ', attrib)){ #exclude "BYLINE:", "SECTION:", "PUBLICATION-TYPE:" etc.
          head <- cleanNews(attrib)
        }
      }
    }
    if(nchar(by) == 0) by <- NA
    if(nchar(code) == 0) code <- NA
    
    #print(paste(date, time, sep=' '))
    datetime = format(strptime(paste(date, time), format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(date, time))
    
    heads = append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    
    timezones <- append(timezones, timezone)
    editions <- append(editions, edition)
    lengths <- append(lengths, length)
  }
  #print(datetimes)
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'), 
                    timezone = as.character(timezones), 
                    edition = as.factor(editions), 
                    length = as.numeric(lengths),
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))
}
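
The date and time of each Nexis article are reconstructed from the dateline with strptime(), using the format '%B %d %Y %I:%M %p'. Since '%B' and '%p' are locale-dependent, that parsing step can be checked in isolation like this (the sample dateline is invented):

# Quick check of the Nexis datetime parsing step; the dateline is a made-up example.
Sys.setlocale('LC_TIME', 'C') # '%B' and '%p' need English month and AM/PM names
format(strptime('January 15 2014 9:30 AM', format = '%B %d %Y %I:%M %p'),
       '%Y-%m-%d %H:%M:%S UTC')
# "2014-01-15 09:30:00 UTC"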

#readFactivaHTML('/home/kohei/Documents/Syria report/factiva.html')
readFactivaHTML <- function(file, sep = ' | '){
  
  heads = c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  cat('Reading', file, '\n')
  
  library(XML)
  doc <- htmlParse(file, encoding="UTF-8")
  nodes <- getNodeSet(doc, '/html/body//div[contains(@class, "article")]')
  
  for(node in nodes){
    #print(node)
    
    head <- ''
    headns <- getNodeSet(node, './div[@id="hd"]')
    if(length(headns)){
      head <- xmlValue(headns[[1]])
      head <- cleanNews(head)
    }
    if(nchar(head) == 0) head <- NA
    
    body <- ''
    bodyns <- getNodeSet(node, './p[contains(@class, "articleParagraph") and not(.//pre)]')
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse = sep)
    }
    if(nchar(body) == 0) body <- NA
    
    pub <- ''
    length <- 0
    date <- ''
    time <- ''
    pos <- 0
    posTime <- 0
    attribns <- getNodeSet(node, './div[not(@id) and not(@class)]')
    if(length(attribns)){
      for(attribn in attribns){
        pos <- pos + 1
        #print(paste(posTime, pos))
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl(' words$', attrib)){
          length <- as.numeric(gsub(' words$', '', attrib))
        }else if(grepl('[0-9]{1,2} [a-zA-Z]+ [0-9]{4}', attrib)){
          date <- attrib
          #date <- strsplit(attrib, ' ', fixed = TRUE)[[1]]
        }else if(grepl('[0-9]{2}:[0-9]{2}', attrib)){
          time <- attrib
          posTime <- pos
        }else if(pos == (posTime + 1)){ #publication name must be next to the time
          pub <- attrib
        }
      }
    }
    if(nchar(pub) == 0) pub <- NA
    if(nchar(date) == 0) date <- NA
    if(nchar(time) == 0) time <- NA
    
    #print(paste(pub, date[1], date[2], date[3], time, head, length))
    #print(paste(date, time, sep=' '))
    datetime = format(strptime(paste(date, ' ', time, sep=''), format='%d %B %Y %H:%M'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(pub, datetime, head, length))
    #print(body)
    
    heads = append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    timezones <- append(timezones, '')
    editions <- append(editions, '')
    lengths <- append(lengths, length)
  }
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'),
                    timezone = as.character(timezones), 
                    edition = editions,
                    length = as.numeric(lengths), 
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))  
}

cleanNews <- function(text){
  text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
  text <- gsub("\\s\\s+", " ", text)
  text <- gsub("^\\s+|\\s+$", "", text)
  return(text)
}

cleanReuters <- function(text){
  text <- gsub('\\(Writing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Editing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Additional reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('Reuters', '', text, ignore.case = TRUE)
  text <- cleanNews(text)
  return(text)
}
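
With all of the functions above sourced, a minimal usage sketch looks like this; the directory paths are placeholders for folders of saved Nexis and Factiva pages. Note that readNexisHTML() writes a debugging copy of the converted HTML to a hardcoded path, which you may want to adjust before running it on your own machine.

# Minimal usage sketch: the paths are placeholders.
nexis   <- readNewsDir('~/news/nexis', type = 'nexis')
factiva <- readNewsDir('~/news/factiva', type = 'factiva')
str(nexis)
str(factiva)
nexis$body <- cleanReuters(nexis$body) # optional: strip Reuters sign-offs and credits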