Sentence segmentation

I believe that the sentence is the optimal unit for sentiment analysis, but splitting whole news articles into sentences is often tricky because news texts contain a lot of quotations. If we simply chop up texts at punctuation marks, quoted passages get split across sentences. This code is meant to avoid such problems as much as possible. It was originally written for Russian-language texts, but it should now work with English as well.

library(stringi)
unitize <- function(df_items, len_min=10, quote='"'){ # Input has to be a data frame with 'tid' and 'body' variables

  df_units <- data.frame()
  for(i in 1:nrow(df_items)){
    print(i)
    body <- insertSeparator(df_items$body[i], len_min, quote)
    if(nchar(body)){
      units <- unlist(strsplit(body, '|', fixed=TRUE))
      flags <- unlist(lapply(units, function(x) grepl('[a-zA-Z0-9]', x))) # Language dependent
      units <- units[flags]
      len <- length(units)
      if(len == 0) next # Skip items with no alphanumeric units
      #print(body)
      #print(len)
      units <- stri_replace_all_fixed(units, '|', ' ') # Remove separator
      units <- stri_replace_all_regex(units, '\\s\\s+', ' ') # Remove duplicated spaces
      units <- stri_trim_both(units)
      df_temp <- data.frame(tid=rep(df_items$tid[i], len), uid=1:len, text=units, stringsAsFactors=FALSE)
      df_units <- rbind(df_units, df_temp)
    }
  }
  write.table(df_units, file='item_units.csv', sep="\t", quote=TRUE, qmethod="double")
}


insertSeparator <- function(text, len_min=10, quote){
  flag_quote <- FALSE
  flag_bracket <- FALSE
  
  text <- stri_replace_all_regex(text, '([^.!?]) \\| ', '$1 ') # Remove wrong paragraph separator
  tokens <- stri_split_fixed(text, ' ', simplify=TRUE)
  tokens2 <- c()
  len <- 0
  for(token in tokens){
    
    # Reset flag by the paragraph separator
    if(stri_detect_fixed(token, '|')){ 
      flag_quote <- FALSE 
      flag_bracket <- FALSE
      #cat("==Reset========\n")
    }
    
    # Set flags
    flag_quote <- xor(flag_quote, stri_count_fixed(token, quote) == 1) # A one-token quotation (both quote marks in one token) does not toggle the flag
    if(stri_detect_fixed(token, '(') != stri_detect_fixed(token, ')')){ # A one-token bracket does not change the flag
      if(stri_detect_fixed(token, '(')) flag_bracket <- TRUE
      if(stri_detect_fixed(token, ')')) flag_bracket <- FALSE
      #cat("---------------\n")
    }
  
    if(len < len_min){
      if(!stri_detect_fixed(token, '|')){
        tokens2 <- c(tokens2, token)
        len <- len + 1
      }
    }else{
      if(stri_detect_fixed(token, '|')){
        tokens2 <- c(tokens2, token)
        len <- 0 
      }else if(!flag_quote & !flag_bracket & stri_detect_regex(token, '([.!?])$')){
        tokens2 <- c(tokens2, token, '|') # Insert split mark
        len <- 0
      }else{
        tokens2 <- c(tokens2, token)
        len <- len + 1
      }
    }
    #cat(token, flag_quote, flag_bracket, len, "\n")
  }
  text2 <- paste(tokens2, collapse=' ')
  return(text2)
}
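A minimal usage sketch (the sample data frame below is made up for illustration):

# Hypothetical input: a data frame with 'tid' and 'body' columns, as unitize() expects
df_items <- data.frame(
  tid = c(1, 2),
  body = c('The minister said: "We will not negotiate. Talks are over." The opposition disagreed.',
           'Markets fell sharply on Monday. Analysts expect a rebound later this week.'),
  stringsAsFactors = FALSE
)
unitize(df_items, len_min = 5) # Writes the segmented units to item_units.csv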

Nexis news importer updated

I posted the code for a Nexis importer last year, but it turned out that the HTML format of the database service is less consistent than I thought, so I changed the logic. The new version depends less on the structure of the HTML files and more on the format of the content.

library(XML) #might need libxml2-dev via apt-get command

readNewsDir <- function(dir,...){
  names <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame(head = c(), body = c(), pub = c(), datetime = c(), edition = c(), length = c(), stringsAsFactors = FALSE)
  for(name in names){
    #print(file)
    if(grepl('\\.html$|\\.htm$|\\.xhtml$', name, ignore.case = TRUE)){
      #print(file)
      df <- rbind(df, readNexisHTML(name, ...))
    }
  }
  return(df)
}

#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(name, sep = ' '){
  
  heads <- c()
  bodies <- c()
  bys <- c()
  pubs <- c()
  datetimes <- c()
  editions <- c()
  lengths <- c()
  
  #Convert format
  cat('Reading', name, '\n')
  
  # HTML cleaning------------------------------------------------
  
  lines <- scan(name, what="character", sep='\n', quiet=TRUE, encoding = "UTF-8")
  docnum <- 0
  for(i in 1:length(lines)){
    #print(lines[i])
    lines[i] <- gsub('<!-- ', '', lines[i]) # Remove HTML comment markers
    lines[i] <- gsub(' -->', '', lines[i])
  }
  lines[i+1] <- '' # Fix EOF problem
  html <- paste(lines, collapse='\n')
  
  # Write to debug
  #cat(html, file="converted.html", sep="", append=FALSE)
  
  # Main process------------------------------------------------
  
  #Load as DOM object
  doc <- htmlParse(html , encoding="UTF-8")
  
  # Remove index
  indexns <- getNodeSet(doc, '/html/body//doc[.//table]')
  for(indexn in indexns){
    #print(xmlValue(indexn))
    removeNodes(indexn)
  }
  
  for(node in getNodeSet(doc, '/html/body//doc')){
    
    pub <- NA
    datetime <- NA
    head <- NA
    by <- NA
    edition <- NA
    section <- NA
    length <- NA
    body <- NA # Initialize so the is.na() check below works when no body is found
    
    i <- 1
    for(div in getNodeSet(node, './/div')){
      value <- cleanNews(xmlValue(div))
      #print(paste(i, value))
      
      if(i == 1 & grepl('\\d+ of \\d+ DOCUMENTS', value)){
        i <- 2
      }else if(i == 2){
        #print(paste('pub', value))
        pub <- value
        i <- 3
      }else if(i == 3 & grepl('^(January|February|March|April|May|June|July|August|September|October|November|December)', value)){
        dateline <- value
        #print(paste('date', value))
        match <- regexpr(paste0('(January|February|March|April|May|June|July|August|September|October|November|December)',
                     '[, ]+([0-9]{1,2})',
                     '[, ]+([0-9]{4})',
                     '([,; ]+(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))?',
                     '([, ]+(.+))?'), value, perl=TRUE)
        date <- c()
        for(j in 1:length(attr(match, "capture.start"))){
          from <- attr(match, "capture.start")[j]
          to <- attr(match, "capture.start")[j] + attr(match, "capture.length")[j] - 1 # substr() is inclusive, so subtract one
          date <- c(date, substr(dateline, from, to))
        }
        month <- gsub('[^a-zA-Z]', '', date[1])
        day <- gsub('[^0-9]', '', date[2])
        year <- gsub('[^0-9]', '', date[3])
        
        datetime <- format(strptime(paste(month, day, year, '12:00 AM'), 
                                   format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC')
        
        if(length(date) == 7){
          edition <- cleanNews(date[7])
          #print(date)
          #print(dateline)
          #print(edition)
        }
        i <- 4
        
      }else if(i == 4 & !grepl('[A-Z]+:', value)){
        head <- value # Sometimes does not exist
        i <- 8
      }else if(i >= 4 & grepl('BYLINE:', value)){
        by <- sub('BYLINE: ', '', value)
        i <- 8
      }else if(i >= 4 & grepl('SECTION:', value)){
        section <- sub('SECTION: ', '', value)
        i <- 8
      }else if(i >= 4 & grepl('LENGTH:', value)){
        length <- strsplit(value, ' ')[[1]][2]
        i <- 8
      }else if(i >= 4 & grepl('[A-Z]+:', value)){
        i <- 8
      }else if(i == 8){
        paras <- c()
        for(p in getNodeSet(div, 'p')){ 
          paras <- c(paras, cleanNews(xmlValue(p)))
        }
        if(length(paras) > 0){
          body <- paste(paras, sep = '', collapse=sep)
        }
        break
      }
    }
    if(!is.na(body)){
      heads <- c(heads, head)
      bodies <- c(bodies, body)
      bys <- c(bys, by)
      pubs <- c(pubs, pub)
      datetimes <- c(datetimes, datetime)
      editions <- c(editions, edition)
      lengths <- c(lengths, length)
    }
  }

  
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'),
                    by = as.factor(bys), 
                    edition = as.character(editions), 
                    length = as.numeric(lengths),
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))
}

cleanNews <- function(text){
  text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
  text <- gsub("\\s\\s+", " ", text)
  text <- gsub("^\\s+|\\s+$", "", text)
  return(text)
}
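A minimal usage sketch (the directory path is hypothetical):

df <- readNewsDir('~/Documents/nexis_html', sep = ' ') # Reads every HTML file under the directory
nrow(df)           # Number of imported articles
range(df$datetime) # Publication date range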

Text analysis dictionary on psychology

My automated dictionary creation project is making good progress, and I have created a psychology dictionary from a large corpus of UK news on psychology from 1990 to 2011. The score given to each entry word is interpreted as the strength of its association with psychology, and the list can be truncated based on the scores.

The words are extracted using a technique that I call collocation-of-collocation. In this technique, a pattern ‘psycholog*’ that matches ‘psychology’, ‘psychologist’, ‘psychological’, and ‘psychologically’ is given to the system, and the system finds collocations of those words. Those collocations are then used to extract words that are semantically close to psychology. This technique is meant to overcome a limitation of collocation analysis in synonym extraction, namely that words with the same meaning rarely co-occur; a toy sketch of the idea follows.
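The sketch below is my own illustrative simplification of the two-step idea, not the actual system: the function names, window size, and scoring by raw co-occurrence counts are assumptions made for brevity.

# Toy sketch of collocation-of-collocation (base R; illustrative only)

# Words within +/- 'window' tokens of the given positions
windowWords <- function(toks, hits, window){
  unlist(lapply(hits, function(i)
    toks[setdiff(max(1, i - window):min(length(toks), i + window), i)]))
}

collocOfColloc <- function(texts, pattern = '^psycholog', window = 5, n_seed = 20){
  tokens <- lapply(texts, function(x){
    toks <- tolower(unlist(strsplit(x, '[^a-zA-Z]+')))
    toks[toks != '']
  })
  # Step 1: find collocations of the words matched by the seed pattern
  seed_counts <- table(as.character(unlist(lapply(tokens, function(toks)
    windowWords(toks, which(grepl(pattern, toks)), window)))))
  seeds <- head(names(sort(seed_counts, decreasing = TRUE)), n_seed)
  # Step 2: score every word by co-occurrence with those collocations, so that
  # words close in meaning to 'psychology' score highly even if they never co-occur with it
  scores <- table(as.character(unlist(lapply(tokens, function(toks)
    windowWords(toks, which(toks %in% seeds), window)))))
  sort(scores, decreasing = TRUE)
}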

Testing immigration dictionary

After making some changes to my automated dictionary creation system, I ran a test to validate the word choice for the new immigration dictionary. Unlike the original version, the latest version contains fewer intuitively negative words with positive scores.

The test was performed by comparing the computer content analysis with human coding of the 2010 UK manifestos. X is the automated coding by the dictionary and Y is the human coding. Green and Conservative are off the 45-degree line, but the automated coding still corresponds strongly to the human coding.

[Scatter plot 'UK 2010': automated coding (X) against human coding (Y) of the manifestos]

News data importer for R

This April, I created an R script to import files downloaded from Nexis and Factiva. Factiva does not offer a file download function, but its search results pages can be saved as HTML files and imported into R using this script.

library(XML) #might need libxml2-dev via apt-get command

readNewsDir <- function(dir, type){
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame(head = c(), body = c(), pub = c(), datetime = c(), edition = c(), length = c(), stringsAsFactors = FALSE)
  for(file in files){
    #print(file)
    if(grepl('\\.html$|\\.htm$|\\.xhtml$', file, ignore.case = TRUE)){
      #print(file)
      if(type == 'nexis'){
        df <- rbind(df, readNexisHTML(file))
      }else if(type == 'factiva'){
        df <- rbind(df, readFactivaHTML(file))
      }
    }
  }
  return(df)
}

#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(file, sep = ' | '){
    
  heads <- c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  #Convert format
  cat('Reading', file, '\n')
  f <- file(file, encoding = "UTF-8")
  lines <- readLines(f)
  close(f)
  docnum <- 0
  for(i in 1:length(lines)){
    lines[i] <- gsub('<!-- ', '', lines[i]) # Remove HTML comment markers
    lines[i] <- gsub(' -->', '', lines[i])
  }
  html <- paste(lines, collapse='\n')

  #Write to debug
  #cat(html, file="/home/kohei/Documents/R/converted.html", sep="", append=FALSE)
  
  #Extract elements
  doc <- htmlParse(html , encoding="UTF-8")
  ids <- paste0('doc_id_', 1:500)
  for(id in ids){
    query <- paste0('/html/body//doc[@id="', id , '"]')
    nodes <- getNodeSet(doc, query)
    if(length(nodes)){
      node <- nodes[[1]]
    }else{
      next #can not break since there are errors in HTML
    }

    pub <- ''
    #pubns <- getNodeSet(node, './/div[@class="c10"]/p[@class="c11"]/span[@class="c12"]')
    pubns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c1"]/span[@class="c2"]')
    if(length(pubns)){
      i <- 1
      for(pubn in pubns){
        if(grepl('DOCUMENTS$', xmlValue(pubn))){
          docnum <- i
        }
        if(i == docnum + 1){
          pub <- xmlValue(pubn)
          pub <- cleanNews(pub)
        }
        i <- i + 1
      }
    }
    if(nchar(pub) == 0) pub <- NA
    #print(pub)
    
    date <- ''
    dateline <- ''
    #datelinens <- getNodeSet(node, './/div[@class="c13" or @class="c14"]/p[@class="c11"]')
    datelinens <- getNodeSet(node, './/div[@class="c3" or @class="c4"]/p[@class="c1"]')
    if(length(datelinens)) dateline <- xmlValue(datelinens[[1]])
    #print(datelinens)
    dates <- strsplit(sub(',', '', dateline, fixed = TRUE), ' ', fixed = TRUE)[[1]]
    date <- paste(dates[1], dates[2], dates[3], sep=' ')
    if(nchar(date) == 0) date <- NA
    
    edition <- ''
    if(length(dates) >= 5){
      edition <- paste(dates[5:length(dates)], collapse = ' ')
      edition <- cleanNews(edition)
    }
    time <- ''
    timezone <- ''
    if(grepl('^[0-9]{1,2}:[0-9]{1,2} (AM|PM)', edition)){
      timeline <- edition
      timeline.parts <- strsplit(timeline, ' ')[[1]]
      #print(edition)
      #print(timeline.parts)
      time <- paste(timeline.parts[1], timeline.parts[2])
      if(length(timeline.parts) > 2){
        timezone <- paste(timeline.parts[3:length(timeline.parts)], collapse = ' ')
      }
      edition <- ''
      #print(time)
    }
    if(nchar(time) == 0) time <- '12:00 AM'
    if(nchar(edition) == 0) edition <- ''
    if(nchar(timezone) == 0) timezone <- ''
        
    body <- ''
    #bodyns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c17" or @class="c18"]/span[@class="c12" or @class="c14"]')
    #bodyns <- getNodeSet(node, './/div[@class="c5"]/p[@class="c15" or @class="c9" or @class="c9"]/span[@class="c2" or @class="c3"]')
    bodyns1 <- getNodeSet(node, './/div[(@class="c5") and count(.//p) > 1]//p')
    bodyns2 <- getNodeSet(node, './/div[(@class="c4") and count(.//p) > 1]//p')
    if(length(bodyns1) > length(bodyns2)){
      bodyns <- bodyns1
    }else{
      bodyns <- bodyns2
    }
    #if(is.null(bodyns)) print(node)
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse=sep)
      if(nchar(body) == 0) body <- NA
    } 
    
    by <- ''
    code <- ''
    head <- ''
    length <- 0
    #attribns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c5" and .//@class="c12"]')
    attribns1 <- getNodeSet(node, './/div[@class="c5"]/p[@class="c6"]')
    attribns2 <- getNodeSet(node, './/div[@class="c4"]/p[@class="c5"]')
    if(length(attribns1) > length(attribns2)){
      attribns <- attribns1
    }else{
      attribns <- attribns2
    }
    if(length(attribns)){
      for(attribn in attribns){
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl('^BYLINE: ', attrib)){
          by <- gsub('BYLINE: ', '', attrib)
        }
        if(grepl('^LENGTH: ', attrib)){
          length <- as.numeric(gsub('[^0-9]', '', attrib))
        }
        if(grepl('^JOURNAL-CODE: ', attrib)){
          code <- gsub('JOURNAL-CODE: ', '', attrib)
        }
        if(!grepl('^[A-Z\\-]{6,20}: ', attrib)){ #exclude "BYLINE:", "SECTION:", "PUBLICATION-TYPE:" etc.
          head <- cleanNews(attrib)
        }
      }
    }
    if(nchar(by) == 0) by <- NA
    if(nchar(code) == 0) code <- NA
    
    #print(paste(date, time, sep=' '))
    datetime <- format(strptime(paste(date, time), format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(date, time))
    
    heads <- append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    
    timezones <- append(timezones, timezone)
    editions <- append(editions, edition)
    lengths <- append(lengths, length)
  }
  #print(datetimes)
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'), 
                    timezone = as.character(timezones), 
                    edition = as.factor(editions), 
                    length = as.numeric(lengths),
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))
}

#readFactivaHTML('/home/kohei/Documents/Syria report/factiva.html')
readFactivaHTML <- function(file, sep = ' | '){
  
  heads <- c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  cat('Reading', file, '\n')
  
  library(XML)
  doc <- htmlParse(file, encoding="UTF-8")
  nodes <- getNodeSet(doc, '/html/body//div[contains(@class, "article")]')
  
  for(node in nodes){
    #print(node)
    
    head <- ''
    headns <- getNodeSet(node, './div[@id="hd"]')
    if(length(headns)){
      head <- xmlValue(headns[[1]])
      head <- cleanNews(head)
    }
    if(nchar(head) == 0) head <- NA
    
    body <- ''
    bodyns <- getNodeSet(node, './p[contains(@class, "articleParagraph") and not(.//pre)]')
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse = sep)
    }
    if(nchar(body) == 0) body <- NA
    
    pub <- ''
    length <- 0
    date <- ''
    time <- ''
    pos <- 0
    posTime <- 0
    attribns <- getNodeSet(node, './div[not(@id) and not(@class)]')
    if(length(attribns)){
      for(attribn in attribns){
        pos <- pos + 1
        #print(paste(posTime, pos))
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl(' words$', attrib)){
          length <- as.numeric(gsub(' words$', '', attrib))
        }else if(grepl('[0-9]{1,2} [a-zA-Z]+ [0-9]{4}', attrib)){
          date <- attrib
          #date <- strsplit(attrib, ' ', fixed = TRUE)[[1]]
        }else if(grepl('[0-9]{2}:[0-9]{2}', attrib)){
          time <- attrib
          posTime <- pos
        }else if(pos == (posTime + 1)){ #publication name must be next to the time
          pub <- attrib
        }
      }
    }
    if(nchar(pub) == 0) pub <- NA
    if(nchar(date) == 0) date <- NA
    if(nchar(time) == 0) time <- NA
    
    #print(paste(pub, date[1], date[2], date[3], time, head, length))
    #print(paste(date, time, sep=' '))
    datetime <- format(strptime(paste(date, ' ', time, sep=''), format='%d %B %Y %H:%M'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(pub, datetime, head, length))
    #print(body)
    
    heads <- append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    timezones <- append(timezones, '')
    editions <- append(editions, '')
    lengths <- append(lengths, length)
  }
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'),
                    timezone = as.character(timezones), 
                    edition = editions,
                    length = as.numeric(lengths), 
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))  
}

cleanNews <- function(text){
  text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
  text <- gsub("\\s\\s+", " ", text)
  text <- gsub("^\\s+|\\s+$", "", text)
  return(text)
}

cleanReuters <- function(text){
  text <- gsub('\\(Writing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Editing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Additional reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('Reuters', '', text, ignore.case = TRUE)
  text <- cleanNews(text)
  return(text)
}
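A minimal usage sketch (the paths are hypothetical):

df_nexis <- readNewsDir('~/Documents/news/nexis', type = 'nexis')
df_factiva <- readNewsDir('~/Documents/news/factiva', type = 'factiva')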

International Newsmap

I have been running a website called International Newsmap. It collects international news stories from news sites and classifies them according to their geographic focus, using a Bayesian classifier and a lexicon-expansion technique. The sources of news are English-language websites in the US, the UK, New Zealand, India, Singapore, Kenya, and South Africa.

[Screenshot of International Newsmap]

The main difference between International Newsmap and event-discovery systems such as GDELT is its ability to create its own geographic dictionary automatically.
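As a toy illustration of the classification step (not the actual Newsmap code; the function name, seed counts, and smoothing below are made-up assumptions), a naive Bayes scorer over per-country word counts might look like this:

# Toy naive Bayes geographic classifier (illustrative assumptions throughout)
classifyCountry <- function(text, counts, smooth = 1){
  vocab <- unique(unlist(lapply(counts, names)))
  toks <- tolower(unlist(strsplit(text, '[^a-zA-Z]+')))
  toks <- toks[toks %in% vocab] # Ignore out-of-vocabulary words
  scores <- sapply(counts, function(freq){
    f <- ifelse(is.na(freq[toks]), 0, freq[toks])
    sum(log((f + smooth) / (sum(freq) + smooth * length(vocab)))) # Add-one smoothing
  })
  names(which.max(scores))
}

# Made-up seed counts for two countries
counts <- list(kenya = c(nairobi = 20, kenyan = 15, africa = 5),
               india = c(delhi = 25, indian = 18, mumbai = 10))
classifyCountry('Protests erupted in Nairobi on Monday', counts) # Returns "kenya"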