Mitigating the performance agony of R

R users often complain about the language's performance. To keep running time short, we should avoid computationally expensive loops, but we sometimes have no choice but to use long loops, since R offers only a few core functions for textual analysis. I wish the R core developers would address this issue, but in the meantime we can mitigate the performance agony with the cmpfun function.

The function compiles user-defined functions to byte code, and performance can improve by 3 to 4 times. It is very easy to use: just wrap your function like this:

#Before
yourFunction <- function(){...}

#After
library(compiler)
yourFunction <- cmpfun(function(){...})
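
For example, a loop-heavy toy function can be compiled and timed side by side. This is only a sketch: the functions below are made up for illustration, and the actual speed-up depends on the machine and the workload.

library(compiler)

#Interpreted version: an explicit loop over a numeric vector
slowSum <- function(x){
  total <- 0
  for(i in seq_along(x)){
    total <- total + x[i]
  }
  return(total)
}

#Byte-compiled version of the same function
fastSum <- cmpfun(slowSum)

x <- runif(1000000)
system.time(for(j in 1:20) slowSum(x)) #interpreted
system.time(for(j in 1:20) fastSum(x)) #compiled, usually noticeably faster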

I learned this technique from a blog post by Noam Ross.

Import UK parliamentary debate data in Python

I tried to import UK parliamentary debates into R, but it seems that the Hansard reports are too large for R. R is also very poor at handling different character encodings, so I gave up on R and wrote an importer in Python. The Python script imports the XML files into a MySQL database.

#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import division
import os, sys, string, re, time, datetime
import xml.etree.ElementTree as ET
import MySQLdb as MySQL
import HTMLParser as HTML
    
def outputConsole(values):
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(timestamp + ' ' + ' '.join(values))
    
def getEid(node):
    if('id' in node.attrib):
        return node.attrib['id']
    else:
        return ''
	
def getDate(node):
    if('id' in node.attrib):
        return node.attrib['id'].split('/')[2][0:10]
    else:
        return ''

def getSid(node):
    if('speakerid' in node.attrib and len(node.attrib['speakerid'].split('/')) == 3):
        #print(node.attrib['speakerid'])
        return node.attrib['speakerid'].split('/')[2]
    else:
        return 0
        
def getSpeaker(node):
    if('speakername' in node.attrib):
        return node.attrib['speakername'].encode('utf-8')
    else:
        return ''
    
def getTime(node):
    if('time' in node.attrib):
        parts = node.attrib['time'].split(':')
        return '%02d:%02d:%02d' % (int(parts[0]), int(parts[1]), int(parts[2]))
    else:
        return '00:00:00'
        
def getText(node):
    texts = []
    for p in node.findall('p'):
        if p.text != None and len(p.text) > 0:
            texts.append(p.text.encode('utf-8'))
    return(' | '.join(texts))

def execute(query):
    try:
        cur.execute(query)
    except MySQL.Error as e:
        print('Query error: ' + query + str(e))

if __name__ == '__main__':
    
    html = HTML.HTMLParser()
    
    db = MySQL.connect(host="localhost", user="username", passwd="password", db="immigration", charset='utf8')
    db.autocommit(True)
    cur = db.cursor()
    
    xmlDir = '/home/kohei/Documents/UK immigration dictionary/UK Parlimentary debates/scrapedxml/debates'
    if os.path.isdir(xmlDir) == False:
        outputConsole(['Directory does not exist', xmlDir])
        sys.exit()
    xmlFiles = [ xmlDir + '/' + xmlFile for xmlFile in os.listdir(xmlDir) if os.path.isfile(xmlDir + '/' + xmlFile) ]
    
    execute("TRUNCATE `debate`")
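    # Assumed layout of the `debate` table (the schema is not shown in the post):
    # columns eid, date, time, sid, speaker and text, matching the INSERT below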
    #print(xmlFiles)
    for xmlFile in xmlFiles:
        outputConsole(['Import', xmlFile])
        doc = ET.parse(xmlFile, parser=None)
        for speech in doc.findall('speech'):
            eid = getEid(speech)
            date = getDate(speech)
            time = getTime(speech)
            sid = getSid(speech)
            speaker = db.escape_string(getSpeaker(speech))
            text = db.escape_string(html.unescape(getText(speech)))
            #print(db.escape_string(text) + '\n')
            query = "INSERT IGNORE INTO `debate` (`eid`, `date`, `time`, `sid`, `speaker`, `text`) VALUES ('%s', '%s', '%s', '%s', '%s', '%s')" % (eid, date, time, sid, speaker, text)
            execute(query)
    db.close()
    sys.exit()
   

Import UK parliamentary debate data into R

Debates in the UK Parliament are transcribed and published online as Hansard, but it is not easy to scrape all the texts from the website. A much better source of parliamentary debate data is ParlParse, a project of TheyWorkForYou, which provides the Hansard reports as XML files. Still, we have to write a script to import the data into R.

library(XML)

importDebates <- function(){
  
  dir <- '/home/kohei/Documents/UK immigration dictionary/UK Parlimentary debates/scrapedxml/debates'
  df <- readDebateDir(dir)
  return(df)
}

readFile <- function(fileName){
  file <- file(fileName, encoding = "ISO-8859-1")
  lines <- readLines(file)
  close(file)
  lines2 <- iconv(lines, "ISO-8859-1", "UTF-8")
  text <- paste(lines2, collapse = '\n')
  text2 <- gsub('encoding="ISO-8859-1"', 'encoding="UTF-8"', text, fixed = TRUE)
  return(text2)
}
readDebateDir <- function(dir){
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame(date = c(), time = c(), speaker = c(), speakerId = c(), text = c())
  
  for(file in files){
    if(grepl('\\.xml$', file, ignore.case = TRUE)){
      df <- rbind(df, readDebateXML(file))
    }
  }
  return(df)
}

readDebateXML <- function(file){
  cat('Reading', file, '\n')
  #xml <- xmlParse(file, encoding = 'ISO-8859-1')

  xml <- xmlParse(readFile(file))
  #speeches <- xpathApply(xml, '//speech', 'xmlAttrs')
  dates <- c()
  times <- c()
  speakers <- c()
  speakerIds <- c()
  texts <- c()
  speeches <- getNodeSet(xml, '//speech')
  for(speech in speeches){
    values <- getSpeech(speech)
    dates <- append(dates, values[[1]])
    times <- append(times, values[[2]])
    speakers <- append(speakers, values[[3]])
    speakerIds <- append(speakerIds, values[[4]])
    texts <- append(texts, values[[5]])
  }

  df <- data.frame(date = dates, time = times, speaker = speakers, speakerId = speakerIds, text = texts)
  return(df)
}

getSpeech <- function(speech){
  #str(speech[['speakername']])
  attribs <- xmlAttrs(speech)
  #print(xmlAttrs(speech, "speakername"))
  if("speakername" %in% names(attribs)){
    speaker = getSpeaker(attribs[['speakername']])
  }else{
    speaker = ''
  }
  if("speakerid" %in% names(attribs)){
    speakerId = getSpeakerId(attribs[['speakerid']])
  }else{
    speakerId = ''
  }
  if("id" %in% names(attribs)){
    date = getDate(attribs[['id']])
  }else{
    date = ''
  }
  if("time" %in% names(attribs)){
    time = getTime(attribs[['time']])
  }else{
    time = ''
  }
  text <- getSpeechText(speech)
  return(list(date, time, speaker, speakerId, text))
  
}

getSpeaker <- function(speaker){
  speaker2 <- iconv(speaker, "ISO-8859-1", "UTF-8")
  return(speaker2)
}

getSpeechText <- function(speech){
  ps <- unlist(xpathApply(speech, './p', 'xmlValue'))
  ps2 <- removeSpecialChar(ps)
  text <- paste(unlist(ps2), collapse=' | ')
  text2 <- gsub("^\\s+|\\s+$", "", text)
  return(text2)
}

removeSpecialChar <- function(text){
  text2 <- gsub('&[a-z]+;', ' ' , text)
  return(text2)
}

getTime <- function(time){
  
  parts <- unlist(strsplit(time, ':', fixed = TRUE))
  h <- as.numeric(parts[1])
  m <- as.numeric(parts[2])
  s <- as.numeric(parts[3])
  time2 <- paste(h, m, s, sep = ':')
  return(time2)
}

getDate <- function(id){
  parts <- unlist(strsplit(id, '/', fixed = TRUE))
  date <- substr(parts[3], 1, 10)
  return(date)
}

getSpeakerId <- function(id){
  parts <- unlist(strsplit(id, '/', fixed = TRUE))
  speakerId <- parts[3]
  return(speakerId)
}
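
The functions can then be used like this; the directory path below is only a placeholder for wherever the ParlParse XML files are stored.

debates <- readDebateDir('~/data/scrapedxml/debates')
str(debates) #date, time, speaker, speakerId, text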

News data importer for R

This April, I created an R script to import files downloaded from Nexis and Factiva. Factiva does not offer a file download function, but its search result pages can be saved as HTML files and imported into R using this script.

library(XML) #might need libxml2-dev via apt-get command

readNewsDir <- function(dir, type){
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame(head = c(), body = c(), pub = c(), datetime = c(), edition = c(), length = c(), stringsAsFactors = FALSE)
  for(file in files){
    #print(file)
    if(grepl('\\.html$|\\.htm$|\\.xhtml$', file, ignore.case = TRUE)){
      #print(file)
      if(type == 'nexis'){
        df <- rbind(df, readNexisHTML(file))
      }else if(type == 'factiva'){
        df <- rbind(df, readFactivaHTML(file))
      }
    }
  }
  return(df)
}

#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(file, sep = ' | '){
    
  heads = c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  #Convert format
  cat('Reading', file, '\n')
  f <- file(file, encoding = "UTF-8")
  lines <- readLines(f)
  close(f)
  docnum <- 0
  for(i in 1:length(lines)){
    lines[i] <- gsub('<!-- Hide XML section from browser', '', lines[i])
    if(grepl('<DOC NUMBER=1>', lines[i])) docnum <- docnum + 1
    lines[i] <- gsub('<DOC NUMBER=1>', paste0('<DOC ID="doc_id_', docnum, '">'), lines[i])
    lines[i] <- gsub('<DOCFULL> -->', '<DOCFULL>', lines[i])
    lines[i] <- gsub('</DOC> -->', '</DOC>', lines[i])
  }
  html <- paste(lines, collapse='\n')

  #Write to debug
  cat(html, file="/home/kohei/Documents/R/converted.html", sep="", append=FALSE)
  
  #Extract elements
  doc <- htmlParse(html , encoding="UTF-8")
  ids <- paste0('doc_id_', 1:500)
  for(id in ids){
    query <- paste0('/html/body//doc[@id="', id , '"]')
    nodes <- getNodeSet(doc, query)
    if(length(nodes)){
      node <- nodes[[1]]
    }else{
      next #can not break since there are errors in HTML
    }

    pub <- ''
    #pubns <- getNodeSet(node, './/div[@class="c10"]/p[@class="c11"]/span[@class="c12"]')
    pubns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c1"]/span[@class="c2"]')
    if(length(pubns)){
      i <- 1
      for(pubn in pubns){
        if(grepl('DOCUMENTS$', xmlValue(pubn))){
          docnum <- i
        }
        if(i == docnum + 1){
          pub <- xmlValue(pubn)
          pub <- cleanNews(pub)
        }
        i <- i + 1
      }
    }
    if(nchar(pub) == 0) pub <- NA
    #print(pub)
    
    date <- ''
    #datelinens <- getNodeSet(node, './/div[@class="c13" or @class="c14"]/p[@class="c11"]')
    datelinens <- getNodeSet(node, './/div[@class="c3" or @class="c4"]/p[@class="c1"]')
    dateline <- ''
    if(length(datelinens)) dateline <- xmlValue(datelinens[[1]])
    #print(datelinens)
    dates <- strsplit(sub(',', '', dateline, fixed = TRUE), ' ', fixed = TRUE)[[1]]
    date <- paste(dates[1], dates[2], dates[3], sep=' ')
    if(nchar(date) == 0) date <- NA
    
    edition <- ''
    if(length(dates) >= 5){
      edition <- paste(dates[5:length(dates)], collapse = ' ')
      edition <- cleanNews(edition)
    }
    time <- ''
    timezone <- ''
    if(grepl('^[0-9]{1,2}:[0-9]{1,2} (AM|PM)', edition)){
      timeline <- edition
      timeline.parts <- strsplit(timeline, ' ')[[1]]
      #print(edition)
      #print(timeline.parts)
      time <- paste(timeline.parts[1], timeline.parts[2])
      if(length(timeline.parts) > 2){
        timezone <- paste(timeline.parts[3:length(timeline.parts)], collapse = ' ')
      }
      edition <- ''
      #print(time)
    }
    if(nchar(time) == 0) time <- '12:00 AM'
    if(nchar(edition) == 0) edition <- ''
    if(nchar(timezone) == 0) timezone <- ''
        
    body <- ''
    #bodyns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c17" or @class="c18"]/span[@class="c12" or @class="c14"]')
    #bodyns <- getNodeSet(node, './/div[@class="c5"]/p[@class="c15" or @class="c9" or @class="c9"]/span[@class="c2" or @class="c3"]')
    bodyns1 <- getNodeSet(node, './/div[(@class="c5") and count(.//p) > 1]//p')
    bodyns2 <- getNodeSet(node, './/div[(@class="c4") and count(.//p) > 1]//p')
    if(length(bodyns1) > length(bodyns2)){
      bodyns <- bodyns1
    }else{
      bodyns <- bodyns2
    }
    #if(is.null(bodyns)) print(node)
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse=sep)
      if(nchar(body) == 0) body <- NA
    } 
    
    by <- ''
    code <- ''
    head <- ''
    length <- 0
    #attribns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c5" and .//@class="c12"]')
    attribns1 <- getNodeSet(node, './/div[@class="c5"]/p[@class="c6"]')
    attribns2 <- getNodeSet(node, './/div[@class="c4"]/p[@class="c5"]')
    if(length(attribns1) > length(attribns2)){
      attribns <- attribns1
    }else{
      attribns <- attribns2
    }
    if(length(attribns)){
      for(attribn in attribns){
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl('^BYLINE: ', attrib)){
          by <- gsub('BYLINE: ', '', attrib)
        }
        if(grepl('^LENGTH: ', attrib)){
          length <- as.numeric(gsub('[^0-9]', '', attrib))
        }
        if(grepl('^JOURNAL-CODE: ', attrib)){
          code <- gsub('JOURNAL-CODE: ', '', attrib)
        }
        if(!grepl('^[A-Z\\-]{6,20}: ', attrib)){ #exclude "BYLINE:", "SECTION:", "PUBLICATION-TYPE:" etc.
          head <- cleanNews(attrib)
        }
      }
    }
    if(nchar(by) == 0) by <- NA
    if(nchar(code) == 0) code <- NA
    
    #print(paste(date, time, sep=' '))
    datetime = format(strptime(paste(date, time), format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(date, time))
    
    heads = append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    
    timezones <- append(timezones, timezone)
    editions <- append(editions, edition)
    lengths <- append(lengths, length)
  }
  #print(datetimes)
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'), 
                    timezone = as.character(timezones), 
                    edition = as.factor(editions), 
                    length = as.numeric(lengths),
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))
}

#readFactivaHTML('/home/kohei/Documents/Syria report/factiva.html')
readFactivaHTML <- function(file, sep = ' | '){
  
  heads = c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  cat('Reading', file, '\n')
  
  library(XML)
  doc <- htmlParse(file, encoding="UTF-8")
  nodes <- getNodeSet(doc, '/html/body//div[contains(@class, "article")]')
  
  for(node in nodes){
    #print(node)
    
    head <- ''
    headns <- getNodeSet(node, './div[@id="hd"]')
    if(length(headns)){
      head <- xmlValue(headns[[1]])
      head <- cleanNews(head)
    }
    if(nchar(head) == 0) head <- NA
    
    body <- ''
    bodyns <- getNodeSet(node, './p[contains(@class, "articleParagraph") and not(.//pre)]')
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse = sep)
    }
    if(nchar(body) == 0) body <- NA
    
    pub <- ''
    length <- 0
    date <- ''
    time <- ''
    pos <- 0
    posTime <- 0
    attribns <- getNodeSet(node, './div[not(@id) and not(@class)]')
    if(length(attribns)){
      for(attribn in attribns){
        pos <- pos + 1
        #print(paste(posTime, pos))
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl(' words$', attrib)){
          length <- as.numeric(gsub(' words$', '', attrib))
        }else if(grepl('[0-9]{1,2} [a-zA-Z]+ [0-9]{4}', attrib)){
          date <- attrib
          #date <- strsplit(attrib, ' ', fixed = TRUE)[[1]]
        }else if(grepl('[0-9]{2}:[0-9]{2}', attrib)){
          time <- attrib
          posTime <- pos
        }else if(pos == (posTime + 1)){ #publication name must be next to the time
          pub <- attrib
        }
      }
    }
    if(nchar(pub) == 0) pub <- NA
    if(nchar(date) == 0) date <- NA
    if(nchar(time) == 0) time <- NA
    
    #print(paste(pub, date[1], date[2], date[3], time, head, length))
    #print(paste(date, time, sep=' '))
    datetime = format(strptime(paste(date, ' ', time, sep=''), format='%d %B %Y %H:%M'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(pub, datetime, head, length))
    #print(body)
    
    heads = append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    timezones <- append(timezones, '')
    editions <- append(editions, '')
    lengths <- append(lengths, length)
  }
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'),
                    timezone = as.character(timezones), 
                    edition = editions,
                    length = as.numeric(lengths), 
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))  
}

cleanNews <- function(text){
  text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
  text <- gsub("\\s\\s+", " ", text)
  text <- gsub("^\\s+|\\s+$", "", text)
  return(text)
}

cleanReuters <- function(text){
  text <- gsub('\\(Writing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Editing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Additional reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('Reuters', '', text, ignore.case = TRUE)
  text <- cleanNews(text)
  return(text)
}
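
The importer is called on a directory of downloaded files, with type = 'nexis' or type = 'factiva' selecting the parser. The paths below are only placeholders.

nexis <- readNewsDir('~/data/nexis', type = 'nexis')
factiva <- readNewsDir('~/data/factiva', type = 'factiva')
str(nexis) #head, pub, datetime, timezone, edition, length, body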

International Newsmap

I have been running a website called International Newsmap. It collects international news stories from news sites and classifies them according to their geographic focus, using a Bayesian classifier and a lexicon expansion technique. The news sources are English-language websites in the US, the UK, New Zealand, India, Singapore, Kenya, and South Africa.

International Newsmap screen shot

The main difference between International Newsmap and event discovery systems such as GDELT is its ability to create its own geographic dictionary automatically.
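
The core idea can be sketched in a few lines of R. This is only an illustration of seed keywords plus a Naive Bayes style association score, not the actual Newsmap code; the documents, seed words and scoring below are made up.

#Toy documents and seed geographic keywords (purely illustrative)
docs <- c('Westminster debates the budget in London',
          'Nairobi traders react to the shilling slide',
          'The London mayor opens a new railway line',
          'Drought hits farmers outside Nairobi')
seed <- list(gb = 'london', ke = 'nairobi')

#Label documents that contain a seed keyword
tokens <- lapply(tolower(docs), function(x) strsplit(x, '[^a-z]+')[[1]])
labels <- sapply(tokens, function(toks){
  hit <- names(seed)[sapply(seed, function(w) any(w %in% toks))]
  if(length(hit) == 1) hit else NA
})

#Smoothed word-country frequencies (Naive Bayes style)
vocab <- unique(unlist(tokens))
score <- sapply(names(seed), function(cl){
  toks <- unlist(tokens[which(labels == cl)])
  counts <- table(factor(toks, levels = vocab))
  as.numeric(counts + 1) / sum(counts + 1)
})
rownames(score) <- vocab

#Words most strongly associated with each country form the expanded dictionary
assoc <- score / rowSums(score)
head(sort(assoc[, 'gb'], decreasing = TRUE))
head(sort(assoc[, 'ke'], decreasing = TRUE))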