Sentence segmentation

I believe the sentence is the optimal unit for sentiment analysis, but splitting whole news articles into sentences is often tricky because news texts contain a lot of quotations. If we simply chop up the text at punctuation marks, quoted passages get split across different sentences. This code is meant to avoid such problems as much as possible. It was originally written for Russian-language texts, but it should now work with English as well.

library(stringi)
unitize <- function(df_items, len_min=10, quote='"'){ # Input has to be a data frame with 'tid' and 'body' variables

  df_units <- data.frame()
  for(i in 1:nrow(df_items)){
    print(i)
    body <- insertSeparator(df_items$body[i], len_min, quote)
    if(nchar(body)){
      units <- unlist(strsplit(body, '|', fixed=TRUE))
      flags <- unlist(lapply(units, function(x) grepl('[a-zA-Z0-9]', x))) # Language dependent
      units <- units[flags]
      len <- length(units)
      #print(body)
      #print(len)
      units <- stri_replace_all_fixed(units, '|', ' ') # Remove separator
      units <- stri_replace_all_regex(units, '\\s\\s+', ' ') # Remove duplicated spaces
      units <- stri_trim_both(units)
      df_temp <- data.frame(tid=rep(df_items$tid[i], len), uid=1:len, text=units, stringsAsFactors=FALSE)
      df_units <- rbind(df_units, df_temp)
    }
  }
  write.table(df_units, file='item_units.csv', sep="\t", quote=TRUE, qmethod="double")
}


insertSeparator <- function(text, len_min=10, quote){
  flag_quote <- FALSE
  flag_bracket <- FALSE
  
  text <- stri_replace_all_regex(text, '([^.!?]) \\| ', '$1 ') # Remove wrong paragraph separator
  tokens <- stri_split_fixed(text, ' ', simplify=TRUE)
  tokens2 <- c()
  len <- 0
  for(token in tokens){
    
    # Reset flag by the paragraph separator
    if(stri_detect_fixed(token, '|')){ 
      flag_quote <- FALSE 
      flag_bracket <- FALSE
      #cat("==Reset========\n")
    }
    
    # Set flags
    flag_quote <- xor(flag_quote, stri_count_fixed(token, quote) == 1) # Excludes one-word quotations
    if(stri_detect_fixed(token, '(') != stri_detect_fixed(token, ')')){ 
      if(stri_detect_fixed(token, '(')) flag_bracket <- TRUE # Excludes one-word brackets
      if(stri_detect_fixed(token, ')')) flag_bracket <- FALSE # Excludes one-word brackets
      #cat("---------------\n")
    }
  
    if(len < len_min){
      if(!stri_detect_fixed(token, '|')){
        tokens2 <- c(tokens2, token)
        len <- len + 1
      }
    }else{
      if(stri_detect_fixed(token, '|')){
        tokens2 <- c(tokens2, token)
        len <- 0 
      }else if(!flag_quote & !flag_bracket & stri_detect_regex(token, '([.!?])$')){
        tokens2 <- c(tokens2, token, '|') # Insert split mark
        len <- 0
      }else{
        tokens2 <- c(tokens2, token)
        len <- len + 1
      }
    }
    #cat(token, flag_quote, flag_bracket, len, "\n")
  }
  text2 <- paste(tokens2, collapse=' ')
  return(text2)
}
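
To run the segmenter on your own data, something like the following should work. This is only a minimal sketch: the data frame and its contents are hypothetical, and the output is written to item_units.csv in the working directory.

library(stringi)
# Hypothetical input: a data frame with 'tid' and 'body' columns
df_items <- data.frame(tid = 1,
                       body = 'The minister said, "We will not negotiate. The decision is final," before leaving the room. Reporters asked about the new policy.',
                       stringsAsFactors = FALSE)
unitize(df_items, len_min = 5, quote = '"') # Quoted sentences stay in one unit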

Nexis news importer updated

I posted the Nexis importer code last year, but it turned out that the HTML format of the database service is less consistent than I thought, so I changed the logic. The new version depends less on the structure of the HTML files and more on the format of the content.

library(XML) #might need libxml2-dev via apt-get command

readNewsDir <- function(dir,...){
  names <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame(head = c(), body = c(), pub = c(), datetime = c(), edition = c(), length = c(), stringsAsFactors = FALSE)
  for(name in names){
    #print(file)
    if(grepl('\\.html$|\\.htm$|\\.xhtml$', name, ignore.case = TRUE)){
      #print(file)
      df <- rbind(df, readNexisHTML(name, ...))
    }
  }
  return(df)
}

#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(name, sep = ' '){
  
  heads <- c()
  bodies <- c()
  bys <- c()
  pubs <- c()
  datetimes <- c()
  editions <- c()
  lengths <- c()
  
  #Convert format
  cat('Reading', name, '\n')
  
  # HTML cleaning------------------------------------------------
  
  lines <- scan(name, what="character", sep='\n', quiet=TRUE, encoding = "UTF-8")
  docnum <- 0
  for(i in 1:length(lines)){
    #print(lines[i])
    lines[i] <- gsub('<!--', '', lines[i]) # Strip HTML comment markers so commented-out content is parsed
    lines[i] <- gsub(' -->', '', lines[i])
  }
  lines[i+1] = '' # Fix EOF problem
  html <- paste(lines, collapse='\n')
  
  # Write to debug
  #cat(html, file="converted.html", sep="", append=FALSE)
  
  # Main process------------------------------------------------
  
  #Load as DOM object
  doc <- htmlParse(html , encoding="UTF-8")
  
  # Remove index
  indexns <- getNodeSet(doc, '/html/body//doc[.//table]')
  for(indexn in indexns){
    #print(xmlValue(indexn))
    removeNodes(indexn)
  }
  
  for(node in getNodeSet(doc, '/html/body//doc')){
    
    pub <- NA
    datetime <- NA
    head <- NA
    by <- NA
    edition <- NA
    section <- NA
    length <- NA
    body <- NA # Initialized so the is.na(body) check below works when a document has no body
    
    i <- 1
    for(div in getNodeSet(node, './/div')){
      value <- cleanNews(xmlValue(div))
      #print(paste(i, value))
      
      if(i == 1 & grepl('\\d+ of \\d+ DOCUMENTS', value)){
        i <- 2
      }else if(i == 2){
        #print(paste('pub', value))
        pub <- value
        i <- 3
      }else if(i == 3 & grepl('^(January|February|March|April|May|June|July|August|September|October|November|December)', value)){
        dateline <- value
        #print(paste('date', value))
        match <- regexpr(paste0('(January|February|March|April|May|June|July|August|September|October|November|December)',
                     '[, ]+([0-9]{1,2})',
                     '[, ]+([0-9]{4})',
                     '([,; ]+(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))?',
                     '([, ]+(.+))?'), value, perl=TRUE)
        date <- c()
        for(j in 1:length(attr(match, "capture.start"))){
          from <- attr(match, "capture.start")[j]
          to <- attr(match, "capture.start")[j] + attr(match, "capture.length")[j]
          date <- c(date, substr(dateline, from, to))
        }
        month <- gsub('[^a-zA-Z]', '', date[1])
        day <- gsub('[^0-9]', '', date[2])
        year <- gsub('[^0-9]', '', date[3])
        
        datetime <- format(strptime(paste(month, day, year, '12:00 AM'), 
                                   format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC')
        
        if(length(date) == 7){
          edition <- cleanNews(date[7])
          #print(date)
          #print(dateline)
          #print(edition)
        }
        i <- 4
        
      }else if(i == 4 & !grepl('[A-Z]+:', value)){
        head <- value # Sometimes does not exist
        i <- 8
      }else if(i >= 4 & grepl('BYLINE:', value)){
        by <- sub('BYLINE: ', '', value)
        i <- 8
      }else if(i >= 4 & grepl('SECTION:', value)){
        section <- sub('SECTION: ', '', value)
        i <- 8
      }else if(i >= 4 & grepl('LENGTH:', value)){
        length <- strsplit(value, ' ')[[1]][2]
        i <- 8
      }else if(i >= 4 & grepl('[A-Z]+:', value)){
        i <- 8
      }else if(i == 8){
        paras <- c()
        for(p in getNodeSet(div, 'p')){ 
          paras <- c(paras, cleanNews(xmlValue(p)))
        }
        if(length(paras) > 0){
          body <- paste(paras, sep = '', collapse=sep)
        }
        break
      }
    }
    if(!is.na(body)){
      heads <- c(heads, head)
      bodies <- c(bodies, body)
      bys <- c(bys, by)
      pubs <- c(pubs, pub)
      datetimes <- c(datetimes, datetime)
      editions <- c(editions, edition)
      lengths <- c(lengths, length)
    }
  }

  
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'),
                    by = as.factor(bys), 
                    edition = as.character(editions), 
                    length = as.numeric(lengths),
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))
}

cleanNews <- function(text){
  text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
  text <- gsub("\\s\\s+", " ", text)
  text <- gsub("^\\s+|\\s+$", "", text)
  return(text)
}
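
To import a folder of Nexis HTML files with this version, the call would look something like this (the directory path is hypothetical):

# Hypothetical directory containing HTML files downloaded from Nexis
df <- readNewsDir('~/Documents/nexis-files', sep = ' ')
head(df[, c('pub', 'datetime', 'head')])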

Factor analysis in R and Python

Python has a number of statistical modules that allow us to perform analysis without R, but it is always a good idea to compare the outputs of different implementations. I performed factor analysis using Python's Scikit-learn module for my dictionary creation system, but the outputs were completely different from those of R's factanal function, just like in someone's post on Stack Overflow. After long hours, I finally found that it was because I hadn't normalized the data for Scikit-learn. Factanal normalizes the data automatically, but Scikit-learn doesn't. The right way to perform factor analysis must be this:

from sklearn import decomposition, preprocessing

data_normal = preprocessing.scale(data) # Normalization
fa = decomposition.FactorAnalysis(n_components=1)
fa.fit(data_normal)
print(fa.components_) # Factor loadings

If you do it like this, the factor loadings estimated by Scikit-learn become very close to R's estimates:

# Python (Scikit-learn)
1: 0.24705429
2: 0.56100678
3: 0.48559474
4: 0.54208185
5: 0.50989289
6: 0.33028625
7: 0.38651951

# R (factanal)
1: 0.285719656390773
2: 0.633553717909623
3: 0.493731965398187
4: 0.527418210503982
5: 0.487150249901473
6: 0.312724093202758
7: 0.378827084637606
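
For reference, the factanal side can be reproduced with something like the code below. This is only a sketch assuming the same variables are in a data frame called data (a hypothetical name); factanal works on the correlation matrix, so no explicit scaling is needed.

# R: factanal standardizes internally (it analyzes the correlation matrix)
fit <- factanal(data, factors = 1)
print(fit$loadings) # Factor loadings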

Import UK parliamentary debate data into R

Debates in the UK parliament are transcribed and published online as Hansard, but it is not easy to scrape all the texts from the website. A much better source of parliamentary debate data is ParlParse, a website of TheyWorkForYou. On the website, Hansard reports are provided as XML files. Yet we still have to write a script to import the data into R.
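
For orientation, here is a simplified, made-up example of the kind of <speech> element the script below expects; the attribute names (id, speakerid, speakername, time) are the ones the parsing functions look for, but the values themselves are invented.

library(XML)
# A made-up <speech> element with the attributes the functions below look for
xml <- xmlParse('<publicwhip>
  <speech id="uk.org.publicwhip/debate/2014-01-01a.1.0"
          speakerid="uk.org.publicwhip/member/12345"
          speakername="Jane Example" time="14:30:00">
    <p>An example paragraph of a speech.</p>
  </speech>
</publicwhip>', asText = TRUE)
getNodeSet(xml, '//speech')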

library(XML)

importDebates <- function(){
  
  dir <- '/home/kohei/Documents/UK immigration dictionary/UK Parlimentary debates/scrapedxml/debates'
  df <- readDebateDir(dir)
  return(df)
}

readFile <- function(fileName){
  file <- file(fileName, encoding = "ISO-8859-1")
  lines <- readLines(file)
  close(file)
  lines2 <- iconv(lines, "ISO-8859-1", "UTF-8")
  text <- paste(lines2, collapse = '\n')
  text2 <- gsub('encoding="ISO-8859-1"', 'encoding="UTF-8"', text, fixed = TRUE)
  return(text2)
}
readDebateDir <- function(dir){
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame(date = c(), time = c(), speaker = c(), speakerId = c(), text = c())
  
  for(file in files){
    if(grepl('\\.xml$', file, ignore.case = TRUE)){
      df <- rbind(df, readDebateXML(file))
    }
  }
  return(df)
}

readDebateXML <- function(file){
  cat('Reading', file, '\n')
  #xml <- xmlParse(file, encoding = 'ISO-8859-1')

  xml <- xmlParse(readFile(file))
  #speeches <- xpathApply(xml, '//speech', 'xmlAttrs')
  dates <- c()
  times <- c()
  speakers <- c()
  speakerIds <- c()
  texts <- c()
  speeches <- getNodeSet(xml, '//speech')
  for(speech in speeches){
    values <- getSpeech(speech)
    dates <- append(dates, values[[1]])
    times <- append(times, values[[2]])
    speakers <- append(speakers, values[[3]])
    speakerIds <- append(speakerIds, values[[4]])
    texts <- append(texts, values[[5]])
  }

  df <- data.frame(date = dates, time = times, speaker = speakers, speakerId = speakerIds, text = texts)
  return(df)
}

getSpeech <- function(speech){
  #str(speech[['speakername']])
  attribs <- xmlAttrs(speech)
  #print(xmlAttrs(speech, "speakername"))
  if("speakername" %in% names(attribs)){
    speaker = getSpeaker(attribs[['speakername']])
  }else{
    speaker = ''
  }
  if("speakerid" %in% names(attribs)){
    speakerId = getSpeakerId(attribs[['speakerid']])
  }else{
    speakerId = ''
  }
  if("id" %in% names(attribs)){
    date = getDate(attribs[['id']])
  }else{
    date = ''
  }
  if("time" %in% names(attribs)){
    time = getTime(attribs[['time']])
  }else{
    time = ''
  }
  text <- getSpeechText(speech)
  return(list(date, time, speaker, speakerId, text))
  
}

getSpeaker <- function(speaker){
  speaker2 <- iconv(speaker, "ISO_8859-1", "UTF-8")
  return(speaker2)
}

getSpeechText <- function(speech){
  ps <- unlist(xpathApply(speech, './p', 'xmlValue'))
  ps2 <- removeSpecialChar(ps)
  text <- paste(unlist(ps2), collapse=' | ')
  text2 <- gsub("^\\s+|\\s+$", "", text)
  return(text2)
}

removeSpecialChar <- function(text){
  text2 <- gsub('&[a-z]+;', ' ' , text)
  return(text2)
}

getTime <- function(time){
  
  parts <- unlist(strsplit(time, ':', fixed = TRUE))
  h <- as.numeric(parts[1])
  m <- as.numeric(parts[2])
  s <- as.numeric(parts[3])
  time2 <- paste(h, m, s, sep = ':')
  return(time2)
}

getDate <- function(id){
  parts <- unlist(strsplit(id, '/', fixed = TRUE))
  date <- substr(parts[3], 1, 10)
  return(date)
}

getSpeakerId <- function(id){
  parts <- unlist(strsplit(id, '/', fixed = TRUE))
  speakerId <- parts[3]
  return(speakerId)
}
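
A minimal usage sketch, with a hypothetical path to a local copy of the ParlParse debate XML files:

# Hypothetical directory containing ParlParse debate XML files
df <- readDebateDir('~/parlparse/scrapedxml/debates')
head(df[, c('date', 'speaker')])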

News data importer for R

This April, I created an R script to import files downloaded from Nexis and Factiva. Factiva does not offer a file download function, but its search results pages can be saved as HTML files and imported into R using this script.

library(XML) #might need libxml2-dev via apt-get command

readNewsDir <- function(dir, type){
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame(head = c(), body = c(), pub = c(), datetime = c(), edition = c(), length = c(), stringsAsFactors = FALSE)
  for(file in files){
    #print(file)
    if(grepl('\\.html$|\\.htm$|\\.xhtml$', file, ignore.case = TRUE)){
      #print(file)
      if(type == 'nexis'){
        df <- rbind(df, readNexisHTML(file))
      }else if(type == 'factiva'){
        df <- rbind(df, readFactivaHTML(file))
      }
    }
  }
  return(df)
}

#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(file, sep = ' | '){
    
  heads = c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  #Convert format
  cat('Reading', file, '\n')
  f <- file(file, encoding = "UTF-8")
  lines <- readLines(f)
  close(f)
  docnum <- 0
  for(i in 1:length(lines)){
    lines[i] <- gsub('<!--', '', lines[i]) # Strip HTML comment markers so commented-out content is parsed
    lines[i] <- gsub(' -->', '', lines[i])
  }
  html <- paste(lines, collapse='\n')

  #Write to debug
  #cat(html, file="/home/kohei/Documents/R/converted.html", sep="", append=FALSE)
  
  #Extract elements
  doc <- htmlParse(html , encoding="UTF-8")
  ids <- paste0('doc_id_', 1:500)
  for(id in ids){
    query <- paste0('/html/body//doc[@id="', id , '"]')
    nodes <- getNodeSet(doc, query)
    if(length(nodes)){
      node <- nodes[[1]]
    }else{
      next #can not break since there are errors in HTML
    }

    pub <- ''
    #pubns <- getNodeSet(node, './/div[@class="c10"]/p[@class="c11"]/span[@class="c12"]')
    pubns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c1"]/span[@class="c2"]')
    if(length(pubns)){
      i <- 1
      for(pubn in pubns){
        if(grepl('DOCUMENTS$', xmlValue(pubn))){
          docnum <- i
        }
        if(i == docnum + 1){
          pub <- xmlValue(pubn)
          pub <- cleanNews(pub)
        }
        i <- i + 1
      }
    }
    if(nchar(pub) == 0) pub <- NA
    #print(pub)
    
    date <- ''
    dateline <- '' # Initialized in case no dateline node is found
    #datelinens <- getNodeSet(node, './/div[@class="c13" or @class="c14"]/p[@class="c11"]')
    datelinens <- getNodeSet(node, './/div[@class="c3" or @class="c4"]/p[@class="c1"]')
    if(length(datelinens)) dateline <- xmlValue(datelinens[[1]])
    #print(datelinens)
    dates <- strsplit(sub(',', '', dateline, fixed = TRUE), ' ', fixed = TRUE)[[1]]
    date <- paste(dates[1], dates[2], dates[3], sep=' ')
    if(nchar(date) == 0) date <- NA
    
    edition <- ''
    if(length(dates) >= 5){
      edition <- paste(dates[5:length(dates)], collapse = ' ')
      edition <- cleanNews(edition)
    }
    time <- ''
    timezone <- ''
    if(grepl('^[0-9]{1,2}:[0-9]{1,2} (AM|PM)', edition)){
      timeline <- edition
      timeline.parts <- strsplit(timeline, ' ')[[1]]
      #print(edition)
      #print(timeline.parts)
      time <- paste(timeline.parts[1], timeline.parts[2])
      if(length(timeline.parts) > 2){
        timezone <- paste(timeline.parts[3:length(timeline.parts)], collapse = ' ')
      }
      edition <- ''
      #print(time)
    }
    if(nchar(time) == 0) time <- '12:00 AM'
    if(nchar(edition) == 0) edition <- ''
    if(nchar(timezone) == 0) timezone <- ''
        
    body <- ''
    #bodyns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c17" or @class="c18"]/span[@class="c12" or @class="c14"]')
    #bodyns <- getNodeSet(node, './/div[@class="c5"]/p[@class="c15" or @class="c9" or @class="c9"]/span[@class="c2" or @class="c3"]')
    bodyns1 <- getNodeSet(node, './/div[(@class="c5") and count(.//p) > 1]//p')
    bodyns2 <- getNodeSet(node, './/div[(@class="c4") and count(.//p) > 1]//p')
    if(length(bodyns1) > length(bodyns2)){
      bodyns <- bodyns1
    }else{
      bodyns <- bodyns2
    }
    #if(is.null(bodyns)) print(node)
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse=sep)
      if(nchar(body) == 0) body <- NA
    } 
    
    by <- ''
    code <- ''
    head <- ''
    length <- 0
    #attribns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c5" and .//@class="c12"]')
    attribns1 <- getNodeSet(node, './/div[@class="c5"]/p[@class="c6"]')
    attribns2 <- getNodeSet(node, './/div[@class="c4"]/p[@class="c5"]')
    if(length(attribns1) > length(attribns2)){
      attribns <- attribns1
    }else{
      attribns <- attribns2
    }
    if(length(attribns)){
      for(attribn in attribns){
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl('^BYLINE: ', attrib)){
          by <- gsub('BYLINE: ', '', attrib)
        }
        if(grepl('^LENGTH: ', attrib)){
          length <- as.numeric(gsub('[^0-9]', '', attrib))
        }
        if(grepl('^JOURNAL-CODE: ', attrib)){
          code <- gsub('JOURNAL-CODE: ', '', attrib)
        }
        if(!grepl('^[A-Z\\-]{6,20}: ', attrib)){ #exclude "BYLINE:", "SECTION:", "PUBLICATION-TYPE:" etc.
          head <- cleanNews(attrib)
        }
      }
    }
    if(nchar(by) == 0) by <- NA
    if(nchar(code) == 0) code <- NA
    
    #print(paste(date, time, sep=' '))
    datetime = format(strptime(paste(date, time), format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(date, time))
    
    heads = append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    
    timezones <- append(timezones, timezone)
    editions <- append(editions, edition)
    lengths <- append(lengths, length)
  }
  #print(datetimes)
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'), 
                    timezone = as.character(timezones), 
                    edition = as.factor(editions), 
                    length = as.numeric(lengths),
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))
}

#readFactivaHTML('/home/kohei/Documents/Syria report/factiva.html')
readFactivaHTML <- function(file, sep = ' | '){
  
  heads = c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  cat('Reading', file, '\n')
  
  library(XML)
  doc <- htmlParse(file, encoding="UTF-8")
  nodes <- getNodeSet(doc, '/html/body//div[contains(@class, "article")]')
  
  for(node in nodes){
    #print(node)
    
    head <- ''
    headns <- getNodeSet(node, './div[@id="hd"]')
    if(length(headns)){
      head <- xmlValue(headns[[1]])
      head <- cleanNews(head)
    }
    if(nchar(head) == 0) head <- NA
    
    body <- ''
    bodyns <- getNodeSet(node, './p[contains(@class, "articleParagraph") and not(.//pre)]')
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse = sep)
    }
    if(nchar(body) == 0) body <- NA
    
    pub <- ''
    length <- 0
    date <- ''
    time <- ''
    pos <- 0
    posTime <- 0
    attribns <- getNodeSet(node, './div[not(@id) and not(@class)]')
    if(length(attribns)){
      for(attribn in attribns){
        pos <- pos + 1
        #print(paste(posTime, pos))
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl(' words$', attrib)){
          length <- as.numeric(gsub(' words$', '', attrib))
        }else if(grepl('[0-9]{1,2} [a-zA-Z]+ [0-9]{4}', attrib)){
          date <- attrib
          #date <- strsplit(attrib, ' ', fixed = TRUE)[[1]]
        }else if(grepl('[0-9]{2}:[0-9]{2}', attrib)){
          time <- attrib
          posTime <- pos
        }else if(pos == (posTime + 1)){ #publication name must be next to the time
          pub <- attrib
        }
      }
    }
    if(nchar(pub) == 0) pub <- NA
    if(nchar(date) == 0) date <- NA
    if(nchar(time) == 0) time <- NA
    
    #print(paste(pub, date[1], date[2], date[3], time, head, length))
    #print(paste(date, time, sep=' '))
    datetime = format(strptime(paste(date, ' ', time, sep=''), format='%d %B %Y %H:%M'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(pub, datetime, head, length))
    #print(body)
    
    heads = append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    timezones <- append(timezones, '')
    editions <- append(editions, '')
    lengths <- append(lengths, length)
  }
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'),
                    timezone = as.character(timezones), 
                    edition = editions,
                    length = as.numeric(lengths), 
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))  
}

cleanNews <- function(text){
  text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
  text <- gsub("\\s\\s+", " ", text)
  text <- gsub("^\\s+|\\s+$", "", text)
  return(text)
}

cleanReuters <- function(text){
  text <- gsub('\\(Writing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Editing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Additional reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('Reuters', '', text, ignore.case = TRUE)
  text <- cleanNews(text)
  return(text)
}