とりあえず過去ログ分ではうまくいき一段落したので公開してみる。
感想としては、不定型のHTMLのパースはもう嫌。見た目定型に見えても、idを付けてあるタグが違ったり、idの内容のフォーマットが途中から変わったり、要素が必ずしもタグで囲まれているわけではなかったり。こういうパターンなのかと書いてはエラーがでて修正しての繰り返し。面倒すぎる。
HTMLのパースにはBeautifulSoupを使っているのに、RSSの出力がwriteによる完全手作業なのは、名前空間の扱いなどがいまいち分からなかったから。とりあえずバリデータは通ったのでよしとする。もっといい方法を知っている人がいたら教えてください。
RSS Validator (feedAnalyzer)
で、なんでこんなことをしたかというと手持ちのウォークマンで聞くため。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Build a podcast RSS feed for the Galge.com web radio.

The back-number pages are scraped with BeautifulSoup, each episode is
stored as an ``EntryList`` row in a SQLite database, and ``toRSS`` dumps
every stored row as an RSS 2.0 feed with iTunes extension tags.

This is a Python 2 script (``urllib2``, BeautifulSoup 3, ``unicode``).
"""
import codecs
from BeautifulSoup import BeautifulSoup as BS
from sqlobject import *
import os
import datetime
import urllib2
import email
import email.utils
import time
import re
import xml.sax.saxutils as saxutils

galge_url = "http://www.galge.com/radio/galge/"
rss_path = "/var/www/html/podcast/galge.rdf"
rss_source = "/var/www/html/podcast/galge.sqlite"

# The importers number the pages back01.html, back02.html, ... and expect
# 20 episodes on each of them.
_EPISODES_PER_PAGE = 20
_MP3_PATTERN = re.compile(r".*\.mp3$")


class LocalTimeZone(datetime.tzinfo):
    """tzinfo describing the local zone as reported by the ``time`` module."""

    def _is_dst(self, dt):
        """Return True when DST is in effect locally at wall-clock time *dt*."""
        tt = (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second,
              dt.weekday(), 0, -1)
        stamp = time.mktime(tt)
        return time.localtime(stamp).tm_isdst > 0

    def utcoffset(self, dt=None):
        """Return the local UTC offset; the standard offset when *dt* is None."""
        if dt is not None and self._is_dst(dt):
            return datetime.timedelta(seconds=-time.altzone)
        return datetime.timedelta(seconds=-time.timezone)

    def dst(self, dt=None):
        """Return the DST adjustment at *dt*, or zero when DST is not active."""
        if dt is not None and self._is_dst(dt):
            return datetime.timedelta(seconds=time.timezone - time.altzone)
        return datetime.timedelta(0)

    def tzname(self, dt=None):
        """Return the zone name at *dt*; the standard name when *dt* is None."""
        if dt is not None:
            return time.tzname[self._is_dst(dt)]
        return time.tzname[0]


def date_rfc2822(dt):
    """Format the datetime *dt* as an RFC 2822 date string.

    A naive *dt* is interpreted as local time.
    """
    offset = dt.utcoffset()
    if offset is None:
        offset = LocalTimeZone().utcoffset(dt)
    # timedelta.seconds alone is wrong for negative offsets: -5h is stored
    # as days=-1, seconds=68400.
    offset_seconds = offset.days * 86400 + offset.seconds
    dt_tz = tuple(dt.timetuple())[:9] + (offset_seconds,)
    return email.utils.formatdate(email.utils.mktime_tz(dt_tz))


class EntryList(SQLObject):
    """One podcast episode; text columns hold UTF-8 encoded byte strings."""
    title = StringCol(notNone=True)
    subtitle = StringCol()
    url = StringCol(notNone=True, unique=True)  # location of the audio file
    datatype = StringCol()                      # MIME type for <enclosure>
    summary = StringCol()
    date = DateTimeCol()                        # from Last-Modified, may be NULL
    image = StringCol()


sqlhub.processConnection = connectionForURI("sqlite:" + rss_source)
EntryList.createTable(ifNotExists=True)


def abspath(path):
    """Expand environment variables and ``~`` in *path* and make it absolute."""
    return os.path.abspath(os.path.expanduser(os.path.expandvars(path)))


def _to_unicode(value):
    """Return *value* as unicode, decoding byte strings as UTF-8."""
    if isinstance(value, unicode):
        return value
    return value.decode("utf-8")


def _utf8(value):
    """Encode unicode *value* as UTF-8; pass anything else through."""
    if isinstance(value, unicode):
        return value.encode("utf-8")
    return value


def _xml_text(value):
    """Return *value* escaped for use as XML character data."""
    return saxutils.escape(_to_unicode(value))


def _write_item(out, ent):
    """Write one ``<item>`` element for the EntryList row *ent* to *out*."""
    url = _to_unicode(ent.url)
    # Rows written by the importers always carry "audio/mpeg"; fall back to
    # it so a NULL column cannot break the string concatenation.
    datatype = _to_unicode(ent.datatype or "audio/mpeg")
    out.write("\t<item>\n")
    out.write("\t\t<title>" + _xml_text(ent.title) + "</title>\n")
    out.write("\t\t<link>" + saxutils.escape(url) + "</link>\n")
    if ent.subtitle:
        out.write("\t\t<itunes:subtitle>" + _xml_text(ent.subtitle)
                  + "</itunes:subtitle>\n")
    if ent.summary:
        out.write("\t\t<itunes:summary>" + _xml_text(ent.summary)
                  + "</itunes:summary>\n")
    # length is a fixed placeholder; the real file size is not stored.
    out.write("\t\t<enclosure url=" + saxutils.quoteattr(url)
              + " type=" + saxutils.quoteattr(datatype)
              + " length=\"1000\"></enclosure>\n")
    if ent.image:
        out.write("\t\t<itunes:image href="
                  + saxutils.quoteattr(_to_unicode(ent.image))
                  + "></itunes:image>\n")
    out.write("\t\t<guid>" + saxutils.escape(url) + "</guid>\n")
    # The date column is NULL when the server sent no Last-Modified header.
    if ent.date is not None:
        out.write("\t\t<pubDate>" + date_rfc2822(ent.date) + "</pubDate>\n")
    out.write("\t</item>\n")


def toRSS(outfilepath):
    """Write every stored episode, ordered by date, as an RSS feed.

    *outfilepath* is expanded with ``abspath`` and overwritten.
    """
    summary = u"""”金田まひる・倉田まりや”のGalge.comラジオは、美少女ゲームやフィギュア、アニメなど様々な情報をお伝えするポータルサイト・Galge.comがプロデュースするフリーダムなWebラジオです。皆さんから寄せられた情報を御紹介したり、様々な場所に突撃したりと、ユーザーの皆様と一緒に作る楽しい番組を目指しています!!"""
    title = u"金田まひる・倉田まりやのGalge.comラジオ"
    link = "http://www.galge.com/radio/galge/#radio"
    rights = u"(c) 2008 Galge.com / 井桁屋"
    image = "http://www.galge.com/radio/galge/images/Galge_radio03.jpg"

    out = codecs.open(abspath(outfilepath), "w", "utf-8")
    try:
        out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
        out.write('<rss xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"'
                  ' xmlns:dc="http://purl.org/dc/elements/1.1/"'
                  ' xmlns:content="http://purl.org/rss/1.0/modules/content/"'
                  ' version="2.0">\n')
        out.write("<channel>\n")
        out.write("\t<title>" + saxutils.escape(title) + "</title>\n")
        out.write("\t<link>" + saxutils.escape(link) + "</link>\n")
        out.write("\t<language>" + "ja" + "</language>\n")
        out.write("\t<copyright>" + saxutils.escape(rights) + "</copyright>\n")
        out.write("\t<itunes:subtitle>" + saxutils.escape(title)
                  + "</itunes:subtitle>\n")
        out.write("\t<description>" + saxutils.escape(summary)
                  + "</description>\n")
        out.write("\t<itunes:summary>" + saxutils.escape(summary)
                  + "</itunes:summary>\n")
        out.write("\t<itunes:image href=" + saxutils.quoteattr(image)
                  + "></itunes:image>\n")
        out.write("\t<lastBuildDate>" + date_rfc2822(datetime.datetime.now())
                  + "</lastBuildDate>\n")
        for ent in EntryList.selectBy().orderBy("date"):
            _write_item(out, ent)
        out.write("</channel>\n")
        out.write("</rss>\n")
    finally:
        out.close()


def _fetch_soup(url):
    """Download *url* and return it parsed by BeautifulSoup."""
    response = urllib2.urlopen(url)
    try:
        return BS(response.read())
    finally:
        response.close()


def _last_modified(url):
    """Return the Last-Modified time of *url* as a naive local datetime.

    Returns None when the header is missing or cannot be parsed.
    """
    response = urllib2.urlopen(url)
    try:
        header = response.headers.getheader('Last-Modified')
    finally:
        response.close()
    if not header:
        return None
    parsed = email.utils.parsedate_tz(header)
    if parsed is None:
        return None
    # mktime_tz honours the zone given in the header, whereas
    # parsedate + time.mktime would read the GMT value as local time.
    return datetime.datetime.fromtimestamp(email.utils.mktime_tz(parsed))


def _find_mp3_url(content):
    """Return the first ``href`` under *content* ending in .mp3, or None."""
    for anchor in content.findAll("a"):
        href = anchor.get("href")
        if href and _MP3_PATTERN.match(href):
            return href
    return None


def _nth_next(node, steps):
    """Follow the ``.next`` link of *node* *steps* times."""
    for _ in range(steps):
        node = node.next
    return node


def _clean(text):
    """Strip CR/LF pairs and spaces from *text*."""
    return text.replace("\r\n", "").replace(" ", "")


def _store_entry(title, subtitle, url, summary, date, image, connection=None):
    """Insert one episode row, encoding unicode text as UTF-8.

    *connection* is only forwarded when given, so that the class-level
    connection stays in use otherwise.
    """
    fields = dict(title=_utf8(title), subtitle=_utf8(subtitle), url=url,
                  datatype="audio/mpeg", summary=_utf8(summary),
                  date=date, image=image)
    if connection is not None:
        fields["connection"] = connection
    EntryList(**fields)


def _locate_episode(bs, i):
    """Return the node that marks episode *i* on the parsed page *bs*.

    The markup differs between episode ranges, and two episodes need
    special cases because the lookup for them uses another id.
    """
    if i < 81:
        return bs.findAll("a", {"name": "%02d" % i})[0]
    if i == 86:
        # Episode 86 is looked up as the second element with id "96".
        return bs.findAll(attrs={"id": "%02d" % 96})[1]
    if i == 101:
        # Episode 101 is looked up as the second element with id "n102".
        return bs.findAll(attrs={"id": "n102"})[1]
    if i < 101:
        return bs.findAll(attrs={"id": "%02d" % i})[0]
    return bs.findAll(attrs={"id": "n%03d" % i})[0]


def _episode_images(node):
    """Return the 200px-wide bordered images found under *node*."""
    return node.findAll("img", {"width": 200, "border": 1})


def _has_episode_image(node):
    """Return True when *node* contains an image whose src has "galge0"."""
    for img in _episode_images(node):
        # rfind(...) > 0 deliberately mirrors the original test: a src that
        # *starts* with "galge0" does not count.
        if (img.get("src") or "").rfind("galge0") > 0:
            return True
    return False


def importBackNumber(start=1, end=110):
    """Import episodes *start*..*end* (inclusive) from the back-number pages.

    All rows are inserted in one transaction, which is rolled back if any
    episode fails; the triggering exception is re-raised.
    """
    start = max(start, 1)
    # Episodes 1-20 are on page 1, 21-40 on page 2, and so on.  The former
    # ``n // 20 + 1`` mapped episode 20 to page 2.
    first_page = (start - 1) // _EPISODES_PER_PAGE + 1
    last_page = (end - 1) // _EPISODES_PER_PAGE + 1

    conn = sqlhub.getConnection()
    trans = conn.transaction()
    # NOTE(review): kept from the original script; presumably resets the
    # freshly created transaction and is likely redundant - confirm against
    # the SQLObject version in use.
    trans.rollback()
    trans.begin()
    try:
        for page in range(first_page, last_page + 1):
            bs = _fetch_soup(galge_url + "back%02d.html" % page)
            first_no = max(start, (page - 1) * _EPISODES_PER_PAGE + 1)
            last_no = min(end, page * _EPISODES_PER_PAGE)
            for i in range(first_no, last_no + 1):
                content = _locate_episode(bs, i)
                # Climb to the enclosing block that holds the episode image.
                while not _has_episode_image(content):
                    content = content.parent
                    if content is None:
                        raise ValueError(
                            "no enclosing block with an episode image "
                            "for episode %d" % i)

                title = content.h4.contents[0]

                if i < 81:
                    subtitle = content.findAll(
                        "a", {"name": "%02d" % i})[0].next
                elif i < 104:
                    subtitle = content.find(
                        "table", {"class": "txt12k", "width": 340}
                    ).previous.previous.previous
                else:
                    subtitle = content.next
                if isinstance(subtitle, basestring):
                    subtitle = _clean(subtitle)
                else:
                    subtitle = None

                # The two summary lines sit at different distances from
                # the <h4> title depending on the episode range.
                if 44 <= i < 104:
                    first_step, second_step = 5, 7
                else:
                    first_step, second_step = 2, 4
                summary = (_clean(_nth_next(content.h4, first_step)) + "\n"
                           + _clean(_nth_next(content.h4, second_step)))

                url = _find_mp3_url(content)
                if url is None:
                    raise ValueError("no mp3 link for episode %d" % i)
                date = _last_modified(url)

                # Reset per episode so a previous episode's image can never
                # leak in; as before, the last matching <img> wins.
                image = None
                for img in _episode_images(content):
                    image = img.get("src")
                    if image and image.rfind("galge0") > 0:
                        image = galge_url + image

                title = _utf8(title)
                subtitle = _utf8(subtitle)
                summary = _utf8(summary)
                print(title)
                print(subtitle)
                print(url)
                # Create the row through the transaction so rollback can
                # actually undo it.
                _store_entry(title, subtitle, url, summary, date, image,
                             connection=trans)
                print(i)
        trans.commit()
    except Exception:
        trans.rollback()
        raise


def inportBackNumber01():
    """Import episodes 1-20 from back01.html using that page's older layout.

    The misspelt name is kept so existing callers continue to work.
    """
    print("Loading...")
    bs = _fetch_soup(galge_url + "back01.html")
    print("Adding...")
    for i in range(1, 21):
        content = bs.findAll("a", {"name": "%02d" % i})[0].findNext()
        title = content.h4.contents[0]
        try:
            subtitle = content.b.next.next.next
        except AttributeError:
            subtitle = None
        if not isinstance(subtitle, basestring):
            subtitle = None
        summary = (_nth_next(content.h4, 2) + "\n"
                   + _nth_next(content.h4, 4))

        url = _find_mp3_url(content)
        if url is None:
            raise ValueError("no mp3 link for episode %d" % i)
        date = _last_modified(url)

        # Prefer an image inside the block, then one in its parent.
        img = content.find("img", {"width": 200})
        if not img:
            img = content.parent.find("img", {"width": 200})
        if img:
            image = img.get("src")
        else:
            image = None

        title = _utf8(title)
        print(title)
        print(url)
        _store_entry(title, subtitle, url, summary, date, image)
        print(i)


if __name__ == "__main__":
    # Uncomment to import newly published episodes before regenerating:
    # importBackNumber(104, 110)
    toRSS(rss_path)