56 lines
1.6 KiB
Python
56 lines
1.6 KiB
Python
import urllib, re, elementtree.ElementTree as ET
|
|
|
|
class BrowseHandler:
    """Collect key/value pairs from the ``<td>`` cells of XHTML pages.

    Every XHTML ``<td>`` element of a fetched page is visited in document
    order; cells alternate between "key" and "value", starting with a key.
    The resulting pairs are stored per URL in ``self.map``.  Values stored
    under the key ``titles`` or under keys matching ``title<digits>`` are
    treated as URLs and fetched recursively; selected fields of the
    ``title<digits>`` pages are copied into ``self.titleMap``.

    Attributes:
        map: dict mapping each fetched URL to its ``{key: value}`` dict.
        titleMap: dict mapping a title key (e.g. ``'title1'``) to a dict
            holding any of ``'duration'``, ``'file'`` and ``'chaptercount'``.
        currMap: the ``map`` entry of the page most recently started.
        keyColumn, lastKey: scanning state of the most recent page.
        catchCharData, lastVal: reset on every ``start()`` but never read
            by this class; kept for compatibility with external users.
    """

    # Namespace prefix of the XHTML elements looked up in fetched pages.
    _XHTML_NS = '{http://www.w3.org/1999/xhtml}'

    # Keys whose value is the URL of a per-title page.  ``search`` is used
    # (not ``match``), so the pattern may occur anywhere inside the key.
    _TITLE_KEY = re.compile(r'title\d+')

    # Page key -> field name under which the value is stored in titleMap.
    _TITLE_FIELDS = {
        'duration': 'duration',
        'file0': 'file',
        'chaptercount': 'chaptercount',
    }

    def __init__(self):
        """Create a handler with empty result maps."""
        self.catchCharData = False
        self.keyColumn = False
        self.map = {}
        self.currMap = None
        self.lastKey = ''
        self.lastVal = ''
        self.titleMap = {}

    def _iterTag(self, node, localName):
        """Return an iterator over the XHTML ``localName`` elements in node.

        Uses ``iter`` when the ElementTree implementation provides it and
        falls back to the older ``getiterator`` spelling otherwise, so the
        class works with either API generation.
        """
        walker = getattr(node, 'iter', None)
        if walker is None:
            walker = node.getiterator
        return walker(self._XHTML_NS + localName)

    def start(self, url, title='none'):
        """Fetch ``url``, record its table cells and follow title links.

        Args:
            url: address of the XHTML page to fetch; also the key under
                which the page's pairs are stored in ``self.map``.
            title: key of ``self.titleMap`` that receives this page's
                ``duration``, ``file0`` (stored as ``'file'``) and
                ``chaptercount`` values.  The default ``'none'`` is a
                sentinel meaning "do not fill titleMap from this page".

        Raises:
            Whatever ``urllib.urlopen`` or ``ET.parse`` raise when the page
            cannot be fetched or is not well-formed XML.
        """
        # Reset the per-page state; results of earlier pages stay in place.
        self.catchCharData = False
        self.keyColumn = False
        self.map[url] = {}
        self.currMap = self.map[url]
        self.lastKey = ''
        self.lastVal = ''

        # Parse straight from the network handle: urlretrieve() would leave
        # a temporary file behind for every page that is fetched.
        page = urllib.urlopen(url)
        try:
            tree = ET.parse(page)
        finally:
            page.close()

        for cell in self._iterTag(tree, 'td'):
            self.keyColumn = not self.keyColumn
            if self.keyColumn:
                self.lastKey = cell.text
                continue

            # A link inside the value cell takes precedence over its text;
            # when there are several links the last one wins.
            value = ''
            for anchor in self._iterTag(cell, 'a'):
                value = anchor.get('href')
            # Covers "no link", an empty href and an anchor without href.
            if not value:
                value = cell.text
            self.currMap[self.lastKey] = value

        # Now do some processing.  Recursive start() calls rebind
        # self.currMap, so keep iterating this page's own dict.
        entries = self.map[url]
        for key, value in entries.items():
            if key == 'titles' and value:
                # Go straight ahead and parse some more.
                self.start(value)

            # An empty key cell yields None, which re.search cannot handle.
            if key is not None and self._TITLE_KEY.search(key):
                self.titleMap[key] = {}
                # An empty value cell carries no URL to follow.
                if value:
                    self.start(value, key)

            if title != 'none':
                field = self._TITLE_FIELDS.get(key)
                if field is not None:
                    # setdefault: start() may be called directly with a
                    # title that was never registered in titleMap.
                    self.titleMap.setdefault(title, {})[field] = value
|
|
|