Import UK parliamentary debate data in Python

I tried to import UK parliamentary debates into R, but the Hansard reports seem to be too large for R to handle. R is also very poor at handling different character encodings, so I gave up on R and wrote an importer in Python. The Python script imports the XML into a MySQL database.

#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import division
import os, sys, string, re, time, datetime
import xml.etree.ElementTree as ET
import MySQLdb as MySQL
import HTMLParser as HTML
    
def outputConsole(values):
    """Print the strings in `values` on one line, prefixed with the current local time.

    The items of `values` are joined with single spaces and the line is
    written to standard output as ``YYYY-MM-DD HH:MM:SS <joined values>``.
    """
    now = datetime.datetime.now()
    message = ' '.join(values)
    print('%s %s' % (now.strftime('%Y-%m-%d %H:%M:%S'), message))
    
def getEid(node):
    """Return the node's ``id`` attribute, or an empty string when it has none."""
    return node.attrib.get('id', '')
	
def getDate(node):
    """Extract the date portion of the node's ``id`` attribute.

    The id is split on '/' and the first 10 characters of the third segment
    are returned (presumably a ``YYYY-MM-DD`` date -- confirm against the
    XML being imported).

    Returns an empty string when the node has no ``id`` attribute or when
    the id has fewer than three '/'-separated segments, instead of raising
    IndexError on a malformed id.
    """
    segments = node.attrib.get('id', '').split('/')
    if len(segments) < 3:
        return ''
    return segments[2][:10]

def getSid(node):
    """Return the speaker identifier taken from the ``speakerid`` attribute.

    The attribute is split on '/'; only when it has exactly three segments
    is the last one returned (as a string). In every other case, including
    a missing attribute, the integer 0 is returned.
    """
    pieces = node.attrib.get('speakerid', '').split('/')
    if len(pieces) != 3:
        return 0
    return pieces[2]
        
def getSpeaker(node):
    """Return the ``speakername`` attribute encoded as UTF-8.

    An empty string is returned when the node carries no ``speakername``
    attribute.
    """
    try:
        name = node.attrib['speakername']
    except KeyError:
        return ''
    return name.encode('utf-8')
    
def getTime(node):
    """Return the node's ``time`` attribute normalised to ``HH:MM:SS``.

    The attribute is split on ':' and the first three fields are formatted
    as zero-padded integers. Missing trailing fields (e.g. ``'14:30'``) are
    treated as zero. When the attribute is absent, empty, or contains a
    non-numeric field, the default ``'00:00:00'`` is returned rather than
    raising IndexError/ValueError and aborting the import.
    """
    default = '00:00:00'
    raw = node.attrib.get('time')
    if not raw:
        return default
    try:
        fields = [int(part) for part in raw.split(':')[:3]]
    except ValueError:
        return default
    # Pad so that 'HH' or 'HH:MM' still yields three components.
    fields += [0] * (3 - len(fields))
    return '%02d:%02d:%02d' % tuple(fields)
        
def getText(node):
    """Return the text of the node's direct ``<p>`` children as one UTF-8 string.

    Paragraphs with no text (``None`` or empty) are skipped; the remaining
    paragraph texts are joined with ``' | '`` and the result is encoded as
    UTF-8.

    NOTE: only ``p.text`` is used, i.e. the text that precedes the first
    child element of each paragraph; text inside or after nested elements
    is not included.
    """
    # Read from the `node` argument (the original body mistakenly used a
    # global named `speech`, so the parameter was ignored).
    paragraphs = [p.text for p in node.findall('p') if p.text]
    # Join first, then encode once at the boundary; this gives the same
    # bytes as encoding each piece and also works when str and bytes are
    # distinct types.
    return ' | '.join(paragraphs).encode('utf-8')

def execute(query):
    """Run `query` on the module-level cursor `cur`.

    MySQL errors are not propagated: the failing query and the error are
    printed to standard output and the function returns normally, so one
    bad statement does not abort the caller.
    """
    try:
        cur.execute(query)
    # `except X as e` is valid on Python 2.6+ and Python 3, unlike the
    # Python-2-only `except X, e` form.
    except MySQL.Error as e:
        print('Query error: ' + query + str(e))

if __name__ == '__main__':
    # Script entry point: empty the `debate` table, then insert one row per
    # <speech> element found in every XML file of `xmlDir`.

    html = HTML.HTMLParser()

    xmlDir = '/home/kohei/Documents/UK immigration dictionary/UK Parlimentary debates/scrapedxml/debates'
    # Validate the input directory before touching the database, and report
    # the directory that was actually checked (the original referenced an
    # undefined name here and died with NameError).
    if not os.path.isdir(xmlDir):
        outputConsole(['Directory does not exist', xmlDir])
        sys.exit(1)
    candidates = [os.path.join(xmlDir, name) for name in os.listdir(xmlDir)]
    xmlFiles = [path for path in candidates if os.path.isfile(path)]

    db = MySQL.connect(host="localhost", user="username", passwd="password", db="immigration", charset='utf8')
    db.autocommit(True)
    cur = db.cursor()  # module-level name: execute() reads it

    # Values are passed separately from the statement so the driver quotes
    # every column (including eid and date, which were previously
    # interpolated unescaped).
    insertQuery = ("INSERT IGNORE INTO `debate` "
                   "(`eid`, `date`, `time`, `sid`, `speaker`, `text`) "
                   "VALUES (%s, %s, %s, %s, %s, %s)")

    try:
        execute("TRUNCATE `debate`")
        for xmlFile in xmlFiles:
            outputConsole(['Import', xmlFile])
            doc = ET.parse(xmlFile)
            for speech in doc.findall('speech'):
                row = (
                    getEid(speech),
                    getDate(speech),
                    getTime(speech),  # kept out of a local named `time` to avoid shadowing the module
                    getSid(speech),
                    getSpeaker(speech),
                    html.unescape(getText(speech)),
                )
                try:
                    cur.execute(insertQuery, row)
                except MySQL.Error as e:
                    # Best effort, as before: report the failing row and carry on.
                    print('Query error (eid %s): %s' % (row[0], e))
    finally:
        # Always release the connection; previously close() sat after
        # sys.exit() and was never reached.
        db.close()
   
Posts created 113

Leave a Reply

Your email address will not be published. Required fields are marked *

Related Posts

Begin typing your search term above and press enter to search. Press ESC to cancel.

Back To Top