Sentence segmentation

I believe the sentence is the optimal unit for sentiment analysis, but splitting whole news articles into sentences is often tricky because news texts contain a lot of quotations. If we simply chop up the text at punctuation marks, quoted passages get split across different sentences. This code is meant to avoid such problems as much as possible. It was originally written for Russian-language texts but should now work with English as well.
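
To see the problem, here is a made-up example: a naive split on sentence-final punctuation cuts the quotation in half.

library(stringi)
text <- 'He said "We are done. Nothing more to add." and left the room.'
stri_split_regex(text, '(?<=[.!?]) ') # Chops inside the quotation
# Splits into: 'He said "We are done.' / 'Nothing more to add." and left the room.'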

library(stringi)
unitize <- function(df_items, len_min=10, quote='"'){ # Input has to be a data frame with 'tid' and 'body' variables 

  df_units <- data.frame()
  for(i in 1:nrow(df_items)){
    print(i)
    body <- insertSeparator(df_items$body[i], len_min, quote)
    if(nchar(body) > 0){
      units <- unlist(strsplit(body, '|', fixed=TRUE))
      flags <- grepl('[a-zA-Z0-9]', units) # Language dependent: keep only units containing alphanumeric characters
      units <- units[flags]
      len <- length(units)
      #print(body)
      #print(len)
      units <- stri_replace_all_fixed(units, '|', ' ') # Remove separator
      units <- stri_replace_all_regex(units, '\\s\\s+', ' ') # Remove duplicated spaces
      units <- stri_trim_both(units)
      df_temp <- data.frame(tid=rep(df_items$tid[i], len), uid=1:len, text=units, stringsAsFactors=FALSE)
      df_units <- rbind(df_units, df_temp)
    }
  }
  write.table(df_units, file='item_units.csv', sep="\t", quote=TRUE, qmethod="double")
}


insertSeparator <- function(text, len_min=10, quote){
  flag_quote <- FALSE
  flag_bracket <- FALSE
  
  text <- stri_replace_all_regex(text, '([^.!?]) \\| ', '$1 ') # Remove wrong paragraph separator
  tokens <- stri_split_fixed(text, ' ', simplify=TRUE)
  tokens2 <- c()
  len <- 0
  for(token in tokens){
    
    # Reset flag by the paragraph separator
    if(stri_detect_fixed(token, '|')){ 
      flag_quote <- FALSE 
      flag_bracket <- FALSE
      #cat("==Reset========\n")
    }
    
    # Set flags
    flag_quote <- xor(flag_quote, stri_count_fixed(token, quote) == 1) # Excludes one-word quotations
    if(stri_detect_fixed(token, '(') != stri_detect_fixed(token, ')')){ 
      if(stri_detect_fixed(token, '(')) flag_bracket <- TRUE # Excludes one-word brackets
      if(stri_detect_fixed(token, ')')) flag_bracket <- FALSE # Excludes one-word brackets
      #cat("---------------\n")
    }
  
    if(len < len_min){
      if(!stri_detect_fixed(token, '|')){
        tokens2 <- c(tokens2, token)
        len <- len + 1
      }
    }else{
      if(stri_detect_fixed(token, '|')){
        tokens2 <- c(tokens2, token)
        len <- 0 
      }else if(!flag_quote & !flag_bracket & stri_detect_regex(token, '([.!?])$')){
        tokens2 <- c(tokens2, token, '|') # Insert split mark
        len <- 0
      }else{
        tokens2 <- c(tokens2, token)
        len <- len + 1
      }
    }
    #cat(token, flag_quote, flag_bracket, len, "\n")
  }
  text2 <- paste(tokens2, collapse=' ')
  return(text2)
}
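
A minimal usage sketch, assuming a made-up data frame with the required 'tid' and 'body' columns (unitize() saves its output to item_units.csv in the working directory):

df_items <- data.frame(tid = c(1, 2),
                       body = c('He said "We are done. Nothing more to add." and left the room. The meeting ended there.',
                                'Prices rose sharply. Analysts were not surprised by the numbers.'),
                       stringsAsFactors = FALSE)
unitize(df_items, len_min = 2, quote = '"') # One row per sentence-like unit; quotations stay intact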

Nexis news importer updated

I posted the Nexis importer code last year, but it turned out that the HTML format of the database service is less consistent than I thought, so I have changed the logic. The new version depends less on the structure of the HTML files and more on the format of the content.

library(XML) # might need libxml2-dev installed via apt-get

readNewsDir <- function(dir,...){
  names <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame()
  for(name in names){
    #print(name)
    if(grepl('\\.html$|\\.htm$|\\.xhtml$', name, ignore.case = TRUE)){
      df <- rbind(df, readNexisHTML(name, ...))
    }
  }
  return(df)
}

#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(name, sep = ' '){
  
  heads <- c()
  bodies <- c()
  bys <- c()
  pubs <- c()
  datetimes <- c()
  editions <- c()
  lengths <- c()
  
  #Convert format
  cat('Reading', name, '\n')
  
  # HTML cleaning------------------------------------------------
  
  lines <- scan(name, what="character", sep='\n', quiet=TRUE, encoding = "UTF-8")
  for(i in 1:length(lines)){
    #print(lines[i])
    lines[i] <- gsub('<!--', '', lines[i]) # Strip HTML comment openers that hide content from browsers
    lines[i] <- gsub(' -->', '', lines[i]) # Strip the matching comment closers
  }
  lines[i + 1] <- '' # Fix EOF problem
  html <- paste(lines, collapse='\n')
  
  # Write to debug
  #cat(html, file="converted.html", sep="", append=FALSE)
  
  # Main process------------------------------------------------
  
  #Load as DOM object
  doc <- htmlParse(html, encoding="UTF-8")
  
  # Remove index
  indexns <- getNodeSet(doc, '/html/body//doc[.//table]')
  for(indexn in indexns){
    #print(xmlValue(indexn))
    removeNodes(indexn)
  }
  
  for(node in getNodeSet(doc, '/html/body//doc')){
    
    pub <- NA
    datetime <- NA
    head <- NA
    by <- NA
    edition <- NA
    section <- NA
    length <- NA
    body <- NA # Reset here; otherwise a document without body text inherits the previous one
    
    # 'i' tracks the parser state: 1 = document counter, 2 = publication name,
    # 3 = dateline, 4 = headline and metadata fields, 8 = body paragraphs
    i <- 1
    for(div in getNodeSet(node, './/div')){
      value <- cleanNews(xmlValue(div))
      #print(paste(i, value))
      
      if(i == 1 & grepl('\\d+ of \\d+ DOCUMENTS', value)){
        i <- 2
      }else if(i == 2){
        #print(paste('pub', value))
        pub <- value
        i <- 3
      }else if(i == 3 & grepl('^(January|February|March|April|May|June|July|August|September|October|November|December)', value)){
        dateline <- value
        #print(paste('date', value))
        match <- regexpr(paste0('(January|February|March|April|May|June|July|August|September|October|November|December)',
                     '[, ]+([0-9]{1,2})',
                     '[, ]+([0-9]{4})',
                     '([,; ]+(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))?',
                     '([, ]+(.+))?'), value, perl=TRUE)
        date <- c()
        for(j in 1:length(attr(match, "capture.start"))){
          from <- attr(match, "capture.start")[j]
          to <- from + attr(match, "capture.length")[j] - 1 # substr() end position is inclusive
          date <- c(date, substr(dateline, from, to))
        }
        month <- gsub('[^a-zA-Z]', '', date[1])
        day <- gsub('[^0-9]', '', date[2])
        year <- gsub('[^0-9]', '', date[3])
        
        datetime <- format(strptime(paste(month, day, year, '12:00 AM'), 
                                   format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC')
        
        if(length(date) == 7){
          edition <- cleanNews(date[7])
          #print(date)
          #print(dateline)
          #print(edition)
        }
        i <- 4
        
      }else if(i == 4 & !grepl('[A-Z]+:', value)){
        head <- value # Sometimes does not exist
        i <- 8
      }else if(i >= 4 & grepl('BYLINE:', value)){
        by <- sub('BYLINE: ', '', value)
        i <- 8
      }else if(i >= 4 & grepl('SECTION:', value)){
        section <- sub('SECTION: ', '', value)
        i <- 8
      }else if(i >= 4 & grepl('LENGTH:', value)){
        length <- strsplit(value, ' ')[[1]][2]
        i <- 8
      }else if(i >= 4 & grepl('[A-Z]+:', value)){
        i <- 8
      }else if(i == 8){
        paras <- c()
        for(p in getNodeSet(div, 'p')){ 
          paras <- c(paras, cleanNews(xmlValue(p)))
        }
        if(length(paras) > 0){
          body <- paste(paras, sep = '', collapse=sep)
        }
        break
      }
    }
    if(!is.na(body)){
      heads <- c(heads, head)
      bodies <- c(bodies, body)
      bys <- c(bys, by)
      pubs <- c(pubs, pub)
      datetimes <- c(datetimes, datetime)
      editions <- c(editions, edition)
      lengths <- c(lengths, length)
    }
  }

  
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'),
                    by = as.factor(bys), 
                    edition = as.character(editions), 
                    length = as.numeric(lengths),
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))
}

cleanNews <- function(text){
  text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
  text <- gsub("\\s\\s+", " ", text)
  text <- gsub("^\\s+|\\s+$", "", text)
  return(text)
}