# I posted the code of the Nexis importer last year, but it turned out that the HTML format of the database service is less consistent than I thought, so I changed the logic. The new version depends less on the structure of the HTML files and more on the format of the content.
library(XML) #might need libxml2-dev via apt-get command
#' Read every Nexis HTML file under a directory (recursively) and combine
#' the parsed articles into a single data frame.
#'
#' @param dir Directory searched recursively for .html/.htm/.xhtml files.
#' @param ... Passed on to readNexisHTML() (e.g. sep).
#' @return A data.frame with one row per extracted article; an empty
#'   data.frame when no HTML files are found.
readNewsDir <- function(dir, ...) {
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  # Keep only HTML-like files; other files in the tree are ignored
  files <- files[grepl('\\.html$|\\.htm$|\\.xhtml$', files, ignore.case = TRUE)]
  # Parse each file first, then bind once at the end: rbind() inside a
  # loop copies the accumulated data frame on every iteration (O(n^2))
  dfs <- lapply(files, readNexisHTML, ...)
  if (length(dfs) == 0) {
    return(data.frame())
  }
  do.call(rbind, dfs)
}
#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
#' Parse a single Nexis HTML export into a data frame of news articles.
#'
#' Nexis hides its per-document XML markup (<DOC>, <DOCFULL>) inside HTML
#' comments; the raw file is first rewritten so an ordinary HTML parser can
#' see the documents, then each article's metadata and body are extracted
#' from the order and format of its <div> elements.
#'
#' @param name Path to the Nexis HTML file.
#' @param sep  Separator used to join an article's paragraphs into one body.
#' @return data.frame with columns head, pub, datetime (POSIXct, UTC),
#'   by (factor), edition, length (numeric) and body; only documents that
#'   yielded a body are included.
readNexisHTML <- function(name, sep = ' '){

  heads <- c()
  bodies <- c()
  bys <- c()
  pubs <- c()
  datetimes <- c()
  editions <- c()
  lengths <- c()

  cat('Reading', name, '\n')

  # HTML cleaning------------------------------------------------
  # Un-hide the per-document markup and give each <DOC> a unique id so the
  # parser produces one addressable node per article.
  lines <- scan(name, what = "character", sep = '\n', quiet = TRUE,
                encoding = "UTF-8")
  docnum <- 0
  for (i in seq_along(lines)) {
    lines[i] <- gsub('<!-- Hide XML section from browser', '', lines[i])
    if (grepl('<DOC NUMBER=1>', lines[i])) docnum <- docnum + 1
    lines[i] <- gsub('<DOC NUMBER=1>',
                     paste0('<DOC ID="doc_id_', docnum, '">'), lines[i])
    lines[i] <- gsub('<DOCFULL> -->', '<DOCFULL>', lines[i])
    lines[i] <- gsub('</DOC> -->', '</DOC>', lines[i])
  }
  lines[length(lines) + 1] <- '' # Fix EOF problem

  html <- paste(lines, collapse = '\n')

  # Main process------------------------------------------------
  # Load as DOM object
  doc <- htmlParse(html, encoding = "UTF-8")

  # Remove the index: the <doc> nodes that contain a table are the table
  # of contents, not articles
  for (indexn in getNodeSet(doc, '/html/body//doc[.//table]')) {
    removeNodes(indexn)
  }

  for (node in getNodeSet(doc, '/html/body//doc')) {
    pub <- NA
    datetime <- NA
    head <- NA
    by <- NA
    edition <- NA
    section <- NA
    length <- NA
    # FIX: body must be reset per document. Previously it leaked from the
    # prior document (attaching a stale body to body-less articles) and was
    # undefined for the first document, making !is.na(body) error.
    body <- NA
    # 'i' is a tiny state machine tracking which part of the document the
    # next <div> should be: 1 = counter, 2 = publication, 3 = dateline,
    # 4+ = headline / LABEL: fields, 8 = body paragraphs.
    i <- 1
    for (div in getNodeSet(node, './/div')) {
      value <- cleanNews(xmlValue(div))
      if (i == 1 && grepl('\\d+ of \\d+ DOCUMENTS', value)) {
        i <- 2
      } else if (i == 2) {
        pub <- value
        i <- 3
      } else if (i == 3 && grepl('^(January|February|March|April|May|June|July|August|September|October|November|December)', value)) {
        dateline <- value
        match <- regexpr(paste0('(January|February|March|April|May|June|July|August|September|October|November|December)',
                                '[, ]+([0-9]{1,2})',
                                '[, ]+([0-9]{4})',
                                '([,; ]+(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))?',
                                '([, ]+(.+))?'), value, perl = TRUE)
        date <- c()
        for (j in seq_along(attr(match, "capture.start"))) {
          from <- attr(match, "capture.start")[j]
          # FIX: substr() end is inclusive, so the last character of a
          # capture group is start + length - 1 (the original took one
          # character too many from every group)
          to <- attr(match, "capture.start")[j] + attr(match, "capture.length")[j] - 1
          date <- c(date, substr(dateline, from, to))
        }
        month <- gsub('[^a-zA-Z]', '', date[1])
        day <- gsub('[^0-9]', '', date[2])
        year <- gsub('[^0-9]', '', date[3])
        # Nexis gives no time of day, so every article is stamped midnight UTC
        datetime <- format(strptime(paste(month, day, year, '12:00 AM'),
                                    format = '%B %d %Y %I:%M %p'),
                           '%Y-%m-%d %H:%M:%S UTC')
        if (length(date) == 7) {
          edition <- cleanNews(date[7]) # capture group 7 = edition, if present
        }
        i <- 4
      } else if (i == 4 && !grepl('[A-Z]+:', value)) {
        head <- value # headline; sometimes does not exist
        i <- 8
      } else if (i >= 4 && grepl('BYLINE:', value)) {
        by <- sub('BYLINE: ', '', value)
        i <- 8
      } else if (i >= 4 && grepl('SECTION:', value)) {
        section <- sub('SECTION: ', '', value)
        i <- 8
      } else if (i >= 4 && grepl('LENGTH:', value)) {
        length <- strsplit(value, ' ')[[1]][2] # e.g. "LENGTH: 523 words"
        i <- 8
      } else if (i >= 4 && grepl('[A-Z]+:', value)) {
        i <- 8 # some other LABEL: field we do not keep
      } else if (i == 8) {
        # Everything from here is the article body: join the <p> children
        paras <- c()
        for (p in getNodeSet(div, 'p')) {
          paras <- c(paras, cleanNews(xmlValue(p)))
        }
        if (length(paras) > 0) {
          body <- paste(paras, sep = '', collapse = sep)
        }
        break
      }
    }
    if (!is.na(body)) { # keep only documents that actually had a body
      heads <- c(heads, head)
      bodies <- c(bodies, body)
      bys <- c(bys, by)
      pubs <- c(pubs, pub)
      datetimes <- c(datetimes, datetime)
      editions <- c(editions, edition)
      lengths <- c(lengths, length)
    }
  }
  data.frame(head = as.character(heads),
             pub = as.character(pubs),
             datetime = as.POSIXct(datetimes, tz = 'UTC'),
             by = as.factor(bys),
             edition = as.character(editions),
             length = as.numeric(lengths),
             body = as.character(bodies),
             stringsAsFactors = FALSE)
}
# Normalise whitespace in text extracted from HTML nodes: line breaks,
# tabs and any remaining control characters become spaces, runs of
# whitespace collapse to a single space, and the ends are trimmed.
# Vectorised over character vectors; returns a vector of the same length.
cleanNews <- function(text){
  despaced <- gsub("[[:cntrl:]]", " ",
                   gsub("\\r\\n|\\r|\\n|\\t", " ", text), perl = TRUE)
  gsub("^\\s+|\\s+$", "", gsub("\\s\\s+", " ", despaced))
}