Nexis news importer updated

Standard

I posted the code Nexis importer last year, but it tuned out that the HTML format of the database service is less consistent than I though, so I changed the logic. The new version is dependent less on the structure of the HTML files, but more on the format of the content.

library(XML) #might need libxml2-dev via apt-get command

readNewsDir <- function(dir,...){
  names <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame(head = c(), body = c(), pub = c(), datetime = c(), edition = c(), length = c(), stringsAsFactors = FALSE)
  for(name in names){
    #print(file)
    if(grepl('\\.html$|\\.htm$|\\.xhtml$', name, ignore.case = TRUE)){
      #print(file)
      df <- rbind(df, readNexisHTML(name, ...))
    }
  }
  return(df)
}

#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(name, sep = ' '){
  
  heads <- c()
  bodies <- c()
  bys <- c()
  pubs <- c()
  datetimes <- c()
  editions <- c()
  lengths <- c()
  
  #Convert format
  cat('Reading', name, '\n')
  
  # HTML cleaning------------------------------------------------
  
  lines <- scan(name, what="character", sep='\n', quiet=TRUE, encoding = "UTF-8")
  docnum <- 0
  for(i in 1:length(lines)){
    #print(lines[i])
    lines[i] <- gsub('', '', lines[i])
    lines[i] <- gsub(' -->', '', lines[i])
  }
  lines[i+1] = '' # Fix EOF problem
  html <- paste(lines, collapse='\n')
  
  # Write to debug
  #cat(html, file="converted.html", sep="", append=FALSE)
  
  # Main process------------------------------------------------
  
  #Load as DOM object
  doc <- htmlParse(html , encoding="UTF-8")
  
  # Remove index
  indexns <- getNodeSet(doc, '/html/body//doc[.//table]')
  for(indexn in indexns){
    #print(xmlValue(indexn))
    removeNodes(indexn)
  }
  
  for(node in getNodeSet(doc, '/html/body//doc')){
    
    pub <- NA
    datetime <- NA
    head <- NA
    by <- NA
    edition <- NA
    section <- NA
    length <- NA
    
    i <- 1
    for(div in getNodeSet(node, './/div')){
      value <- cleanNews(xmlValue(div))
      #print(paste(i, value))
      
      if(i == 1 & grepl('\\d+ of \\d+ DOCUMENTS', value)){
        i <- 2
      }else if(i == 2){
        #print(paste('pub', value))
        pub <- value
        i <- 3
      }else if(i == 3 & grepl('^(January|February|March|April|May|June|July|August|September|October|November|December)', value)){
        dateline <- value
        #print(paste('date', value))
        match <- regexpr(paste0('(January|February|March|April|May|June|July|August|September|October|November|December)',
                     '[, ]+([0-9]{1,2})',
                     '[, ]+([0-9]{4})',
                     '([,; ]+(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))?',
                     '([, ]+(.+))?'), value, perl=TRUE)
        date <- c()
        for(j in 1:length(attr(match, "capture.start"))){
          from <- attr(match, "capture.start")[j]
          to <- attr(match, "capture.start")[j] + attr(match, "capture.length")[j]
          date <- c(date, substr(dateline, from, to))
        }
        month <- gsub('[^a-zA-Z]', '', date[1])
        day <- gsub('[^0-9]', '', date[2])
        year <- gsub('[^0-9]', '', date[3])
        
        datetime <- format(strptime(paste(month, day, year, '12:00 AM'), 
                                   format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC')
        
        if(length(date) == 7){
          edition <- cleanNews(date[7])
          #print(date)
          #print(dateline)
          #print(edition)
        }
        i <- 4
        
      }else if(i == 4 & !grepl('[A-Z]+:', value)){
        head <- value # Sometimes does not exists
        i <- 8
      }else if(i >= 4 & grepl('BYLINE:', value)){
        by <- sub('BYLINE: ', '', value)
        i <- 8
      }else if(i >= 4 & grepl('SECTION:', value)){
        section <- sub('SECTION: ', '', value)
        i <- 8
      }else if(i >= 4 & grepl('LENGTH:', value)){
        length <- strsplit(value, ' ')[[1]][2]
        i <- 8
      }else if(i >= 4 & grepl('[A-Z]+:', value)){
        i <- 8
      }else if(i == 8){
        paras <- c()
        for(p in getNodeSet(div, 'p')){ 
          paras <- c(paras, cleanNews(xmlValue(p)))
        }
        if(length(paras) > 0){
          body <- paste(paras, sep = '', collapse=sep)
        }
        break()
      }
    }
    if(!is.na(body)){
      heads <- c(heads, head)
      bodies <- c(bodies, body)
      bys <- c(bys, by)
      pubs <- c(pubs, pub)
      datetimes <- c(datetimes, datetime)
      editions <- c(editions, edition)
      lengths <- c(lengths, length)
    }
  }

  
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'),
                    by = as.factor(bys), 
                    edition = as.character(editions), 
                    length = as.numeric(lengths),
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))
}

cleanNews <- function(text){
  text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
  text <- gsub("\\s\\s+", " ", text)
  text <- gsub("^\\s+|\\s+$", "", text)
  return(text)
}

Leave a Reply

Your email address will not be published. Required fields are marked *