Sentence segmentation

I believe that sentence is the optimal unit of sentiment analysis, but splitting whole news articles into sentences is often tricky because there are a lot of quotations in news. If we simply chop up texts based on punctuations, we get quoted texts are split into different sentences. This code is meant to avoid such problems as much as possible. This code is original written for Russian language texts but should work with English now.

library(stringi)
unitize <- function(df_items, len_min=10, quote='"'){ # Input has to be data frame with 'tid' and 'body' vairables 

  df_units <- data.frame()
  for(i in 1:nrow(df_items)){
    print(i)
    body <- insertSeparator(df_items$body[i], len_min, quote)
    if(nchar(body)){
      units <- unlist(strsplit(body, '|', fixed=TRUE))
      flags <- unlist(lapply(units, function(x) grepl('[a-zA-Z0-9]', x))) # Language dependent
      units <- units[flags]
      len <- length(units)
      #print(body)
      #print(len)
      units <- stri_replace_all_fixed(units, '|', ' ') # Remove separator
      units <- stri_replace_all_regex(units, '\\s\\s+', ' ') # Remove duplicated spaces
      units <- stri_trim_both(units)
      df_temp <- data.frame(tid=rep(df_items$tid[i], len), uid=1:len, text=units, stringsAsFactors=FALSE)
      df_units <- rbind(df_units, df_temp)
    }
  }
  write.table(df_units, file='item_units.csv', sep="\t", quote=TRUE, qmethod="double")
}


insertSeparator <- function(text, len_min=10, quote){
  flag_quote <- FALSE
  flag_bracket <- FALSE
  
  text <- stri_replace_all_regex(text, '([^.!?]) \\| ', '$1 ') # Remove wrong paragraph separator
  tokens <- stri_split_fixed(text, ' ', simplify=TRUE)
  tokens2 <- c()
  len <- 0
  for(token in tokens){
    
    # Reset flag by the paragraph separator
    if(stri_detect_fixed(token, '|')){ 
      flag_quote <- FALSE 
      flag_bracket <- FALSE
      #cat("==Reset========\n")
    }
    
    # Set flags
    flag_quote <- xor(flag_quote, stri_count_fixed(token, quote) == 1) # Exlcuded one-word quotaiton
    if(stri_detect_fixed(token, '(') != stri_detect_fixed(token, ')')){ 
      if(stri_detect_fixed(token, '(')) flag_bracket <- TRUE # Exlcuded one-word bracket
      if(stri_detect_fixed(token, ')')) flag_bracket <- FALSE # Exlcuded one-word bracket
      #cat("---------------\n")
    }
  
    if(len < len_min){
      if(!stri_detect_fixed(token, '|')){
        tokens2 <- c(tokens2, token)
        len <- len + 1
      }
    }else{
      if(stri_detect_fixed(token, '|')){
        tokens2 <- c(tokens2, token)
        len <- 0 
      }else if(!flag_quote & !flag_bracket & stri_detect_regex(token, '([.!?])$')){
        tokens2 <- c(tokens2, token, '|') # Insert split mark
        len <- 0
      }else{
        tokens2 <- c(tokens2, token)
        len <- len + 1
      }
    }
    #cat(token, flag_quote, flag_bracket, len, "\n")
  }
  text2 <- paste(tokens2, collapse=' ')
  return(text2)
}
Posts created 111

Leave a Reply

Your email address will not be published. Required fields are marked *

Related Posts

Begin typing your search term above and press enter to search. Press ESC to cancel.

Back To Top