Galge.comラジオをPodCast化

とりあえず過去ログ分ではうまくいき一段落したので公開してみる。
感想としては、不定型のHTMLのパースはもう嫌。見た目定型に見えても、idを付けてあるタグが違ったり、idの内容のフォーマットが途中から変わったり、要素が必ずしもタグで囲まれているわけではなかったり。こういうパターンなのかと書いてはエラーがでて修正しての繰り返し。面倒すぎる。
HTMLのパースにはBeautifulSoupを使っているのに、RSSの出力がwriteによる完全な手作業なのは、名前空間の扱いなどがいまいち分からなかったから。とりあえずバリデータは通ったのでよしとする。誰かRSSの出力に使えるいいやつ（ライブラリ）を知っていたら教えてください。
RSS Validator (feedAnalyzer)
で、なんでこんなことをしたかというと手持ちのウォークマンで聞くため。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
from BeautifulSoup import BeautifulSoup as BS
from sqlobject import *
import os
import datetime
import urllib2
import email
import email.utils
import time
import re
import xml.sax.saxutils as saxutils

# Base URL of the Galge.com radio site (not referenced by the code visible
# in this file; the scraper functions hard-code the same prefix locally).
galge_url = "http://www.galge.com/radio/galge/"
# Filesystem path the generated RSS feed is written to (passed to toRSS).
rss_path = "/var/www/html/podcast/galge.rdf"
# SQLite database file used as the SQLObject connection target.
rss_source = "/var/www/html/podcast/galge.sqlite"

class LocalTimeZone(datetime.tzinfo):
    """``tzinfo`` for the system's local time zone, backed by the ``time`` module.

    Offsets come from ``time.timezone`` (standard time) and ``time.altzone``
    (daylight-saving time).  Every method accepts ``dt=None`` for backward
    compatibility; without a datetime the standard-time answer is returned.
    """

    def __isdst(self, dt):
        """Return True if DST is in effect at local wall-clock time *dt*."""
        tt = (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second,
                dt.weekday(), 0, -1)
        # tm_isdst=-1 lets mktime() decide; converting the timestamp back with
        # localtime() exposes the decision as tm_isdst.
        stamp = time.mktime(tt)
        return time.localtime(stamp).tm_isdst > 0

    def utcoffset(self, dt=None):
        """Return the local offset from UTC (positive east of Greenwich)."""
        if dt is not None and self.__isdst(dt):
            return datetime.timedelta(seconds=-time.altzone)
        return datetime.timedelta(seconds=-time.timezone)

    def dst(self, dt=None):
        """Return the DST adjustment at *dt*, or a zero delta outside DST.

        The tzinfo contract requires ``timedelta(0)`` when DST is not in
        effect; returning the DST delta unconditionally would make
        ``utcoffset(dt) - dst(dt)`` wrong for standard-time datetimes.
        """
        if dt is not None and self.__isdst(dt):
            return datetime.timedelta(seconds=time.timezone - time.altzone)
        return datetime.timedelta(0)

    def tzname(self, dt=None):
        """Return the zone abbreviation that applies at *dt*."""
        if dt is not None:
            # bool indexes the (standard, daylight) name pair.
            return time.tzname[self.__isdst(dt)]
        return time.tzname[0]

def date_rfc2822(dt):
    """Format *dt* as an RFC 2822 date string expressed in UTC.

    dt: a ``datetime.datetime``.  An aware value is converted using its own
        ``tzinfo``; a naive value (or one whose tzinfo reports no offset) is
        interpreted as local time via ``LocalTimeZone``.

    Returns the string produced by ``email.utils.formatdate`` for the
    corresponding UTC timestamp (zone rendered as ``-0000``).
    """
    offset = None
    if dt.tzinfo is not None:
        # tzinfo.utcoffset() requires the datetime it is asked about.
        offset = dt.tzinfo.utcoffset(dt)
    if offset is None:
        offset = LocalTimeZone().utcoffset(dt)
    # timedelta.seconds alone is wrong for negative offsets (they are stored
    # as days=-1 plus a positive remainder), so fold the days back in.
    tzdelta = offset.days * 86400 + offset.seconds
    dt_tz = dt.timetuple() + (tzdelta,)
    return email.utils.formatdate(email.utils.mktime_tz(dt_tz))


class EntryList(SQLObject):
    """One podcast episode, as stored by the import functions in this file."""
    # Episode title (required).
    title = StringCol(notNone=True)
    # Optional subtitle; written as <itunes:subtitle> when present.
    subtitle = StringCol()
    # URL of the audio file; also used as the item's link and guid, so unique.
    url = StringCol(notNone=True, unique=True)
    # MIME type emitted in the <enclosure> element.
    datatype = StringCol()
    # Optional longer description; written as <itunes:summary> when present.
    summary = StringCol()
    # Publication date; the importers derive it from the HTTP Last-Modified
    # header and store None when the header is missing.
    date = DateTimeCol()
    # Optional artwork URL; written as <itunes:image> when present.
    image = StringCol()

# Module-level side effect: set SQLObject's process-wide connection to the
# SQLite file named by rss_source.
sqlhub.processConnection = connectionForURI("sqlite:"+rss_source)
# Create the EntryList table on first run; a no-op when it already exists.
EntryList.createTable(ifNotExists=True)

def abspath(path):
    """Return *path* as an absolute, normalised filesystem path.

    Environment variables are expanded first, then a leading ``~``, and the
    result is finally made absolute.
    """
    with_vars = os.path.expandvars(path)
    with_home = os.path.expanduser(with_vars)
    return os.path.abspath(with_home)

def toRSS(outfilepath):
    """Write all stored episodes to *outfilepath* as an RSS 2.0 podcast feed.

    outfilepath: destination path; ``~`` and environment variables are
        expanded by ``abspath``.  The file is overwritten and encoded as UTF-8.

    The channel metadata is hard-coded; one ``<item>`` is emitted per
    ``EntryList`` row, ordered by ``date``.  Values read from the database are
    XML-escaped before being written, and ``<pubDate>`` is omitted for rows
    whose date is missing.
    """
    def attr(value):
        # Escape a value for use inside a double-quoted XML attribute.
        return saxutils.escape(value, {'"': "&quot;"})

    summary = u"""”金田まひる・倉田まりや”のGalge.comラジオは、美少女ゲームやフィギュア、アニメなど様々な情報をお伝えするポータルサイト・Galge.comがプロデュースするフリーダムなWebラジオです。皆さんから寄せられた情報を御紹介したり、様々な場所に突撃したりと、ユーザーの皆様と一緒に作る楽しい番組を目指しています！！"""
    title = u"金田まひる・倉田まりやのGalge.comラジオ"
    link = "http://www.galge.com/radio/galge/#radio"
    copyright = u"(c) 2008 Galge.com / 井桁屋"
    image = "http://www.galge.com/radio/galge/images/Galge_radio03.jpg"

    out = codecs.open(abspath(outfilepath),"w","utf-8")
    # try/finally guarantees the feed file is flushed and closed even when a
    # row fails to serialise.
    try:
        out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
        out.write("""<rss
            xmlns:itunes=\"http://www.itunes.com/dtds/podcast-1.0.dtd\"
            xmlns:dc="http://purl.org/dc/elements/1.1/"
            xmlns:content="http://purl.org/rss/1.0/modules/content/"
            version=\"2.0\">\n""")
        out.write("<channel>\n")
        out.write("\t<title>" + saxutils.escape(title) + "</title>\n")
        out.write("\t<link>" + link + "</link>\n")
        out.write("\t<language>" + "ja" + "</language>\n")
        out.write("\t<copyright>" + saxutils.escape(copyright)
                + "</copyright>\n")
        out.write("\t<itunes:subtitle>" + saxutils.escape(title)
                + "</itunes:subtitle>\n")
        out.write("\t<description>" + saxutils.escape(summary)
                + "</description>\n")
        out.write("\t<itunes:summary>" + saxutils.escape(summary)
                + "</itunes:summary>\n")
        out.write("\t<itunes:image href=\"" + image + "\"></itunes:image>\n")
        out.write("\t<lastBuildDate>" + date_rfc2822(datetime.datetime.now())
                + "</lastBuildDate>\n")
        for ent in EntryList.selectBy().orderBy("date"):
            out.write("\t<item>\n")
            # Text columns are stored as UTF-8 byte strings; decode them
            # before writing through the UTF-8 codec writer.
            out.write("\t\t<title>"
                    + saxutils.escape(ent.title.decode("utf-8"))
                    + "</title>\n")
            out.write("\t\t<link>" + saxutils.escape(ent.url) + "</link>\n")
            if ent.subtitle:
                out.write("\t\t<itunes:subtitle>"
                        + saxutils.escape(ent.subtitle.decode("utf-8"))
                        + "</itunes:subtitle>\n")
            if ent.summary:
                out.write("\t\t<itunes:summary>"
                        + saxutils.escape(ent.summary.decode("utf-8"))
                        + "</itunes:summary>\n")
            # NOTE: length is a fixed placeholder, not the real file size.
            out.write("\t\t<enclosure url=\"" + attr(ent.url) + "\" type=\""
                    + attr(ent.datatype) + "\" length=\"1000\"></enclosure>\n")
            if ent.image:
                out.write("\t\t<itunes:image href=\"" + attr(ent.image)
                        + "\"></itunes:image>\n")
            out.write("\t\t<guid>" + saxutils.escape(ent.url) + "</guid>\n")
            # The importers store date=None when the server sent no
            # Last-Modified header; skip pubDate rather than crash.
            if ent.date:
                out.write("\t\t<pubDate>" + date_rfc2822(ent.date)
                        + "</pubDate>\n")
            out.write("\t</item>\n")
        out.write("</channel>\n")
        out.write("</rss>\n")
    finally:
        out.close()

def importBackNumber(start=1, end=110):
    """Scrape episodes *start*..*end* from the back-number pages into EntryList.

    start, end: inclusive episode numbers.  Page ``backNN.html`` is treated as
        holding episodes ``20*(NN-1)+1`` through ``20*NN``.

    For each episode the title, subtitle, summary, mp3 URL, artwork URL and the
    mp3's ``Last-Modified`` time are extracted and stored as an ``EntryList``
    row.  The element lookups are keyed on the episode number because the
    page markup is addressed differently for different episode ranges.

    Raises ValueError when an episode block has no artwork or no ``.mp3``
    link; any exception rolls the transaction back and is re-raised.

    NOTE(review): the rows are created without ``connection=trans``, so they
    may not actually take part in the transaction opened here -- confirm
    against the SQLObject transaction documentation.
    """
    def squash(text):
        # Remove CRLF line breaks and ASCII spaces from a text node.
        return text.replace("\r\n", "").replace(" ", "")

    def follow(node, steps):
        # Walk `steps` links along the parser's `.next` chain.
        for _ in range(steps):
            node = node.next
        return node

    # Episode 20 is the last entry of page 1, so subtract one before dividing.
    starturl = (start - 1) // 20 + 1
    endurl = (end - 1) // 20 + 1
    mp3pat = re.compile(r".*\.mp3$")
    baseurl = "http://www.galge.com/radio/galge/"
    now_no = start
    conn = sqlhub.getConnection()
    trans = conn.transaction()
    trans.rollback()
    trans.begin()
    try:
        for c in range(starturl, endurl + 1):
            html = urllib2.urlopen(
                    "http://www.galge.com/radio/galge/back%02d.html"%c)
            try:
                bs = BS(html.read())
            finally:
                html.close()
            # Give every tag a plain dict of its attributes up front so the
            # lookups below can use attrMap on any tag.
            for node in bs.findAll():
                node.attrMap = dict(node.attrs)
            end_no = min(c * 20 + 1, end + 1)
            for i in range(now_no, end_no):
                # Locate an element belonging to episode i.  The lookup key
                # depends on the episode number; 86 and 101 are special-cased
                # to the second element carrying id "96" / "n102".
                if i < 81:
                    content = bs.findAll("a", {"name" : "%02d"%i})[0]
                elif i == 86:
                    content = bs.findAll(attrs={"id" : "%02d"%96})[1]
                elif i == 101:
                    content = bs.findAll(attrs={"id" : "n102"})[1]
                elif i < 101:
                    content = bs.findAll(attrs={"id" : "%02d"%i})[0]
                else:
                    content = bs.findAll(attrs={"id" : "n%03d"%i})[0]
                # Climb towards the root until the element contains a
                # 200px-wide bordered image whose src mentions "galge0";
                # that ancestor is used as the episode's container.
                while True:
                    found = False
                    for img in content.findAll("img",
                            {"width" : 200, "border" : 1}):
                        if img.attrMap["src"].rfind("galge0") > 0:
                            found = True
                            break
                    if found:
                        break
                    if content.parent is None:
                        raise ValueError(
                                "no episode image found for episode %d" % i)
                    content = content.parent
                title = content.h4.contents[0]
                if i < 81:
                    subtitle = content.findAll("a",
                            {"name" : "%02d"%i})[0].next
                elif i < 104:
                    subtitle = content.find("table", {"class" : "txt12k",
                        "width" : 340}).previous.previous.previous
                else:
                    subtitle = content.next
                # Only a text node counts as a subtitle; a tag means none.
                if isinstance(subtitle, basestring):
                    subtitle = squash(subtitle)
                else:
                    subtitle = None
                # The two summary lines sit at different distances after the
                # <h4> depending on the episode range.
                if 44 <= i < 104:
                    first, second = 5, 7
                else:
                    first, second = 2, 4
                summary = (squash(follow(content.h4, first)) + "\n"
                        + squash(follow(content.h4, second)))
                # Pick the first link that points at an .mp3 file.
                url = None
                for atag in content.findAll("a"):
                    href = atag.attrMap.get("href")
                    if href and mp3pat.match(href):
                        url = href
                        break
                if url is None:
                    raise ValueError(
                            "no .mp3 link found for episode %d" % i)
                resp = urllib2.urlopen(url)
                try:
                    date_src = resp.headers.getheader('Last-Modified')
                finally:
                    resp.close()
                # Last-Modified carries its own zone (GMT); honour it instead
                # of reading the fields as local time.
                date = None
                if date_src:
                    parsed = email.utils.parsedate_tz(date_src)
                    if parsed:
                        date = datetime.datetime.fromtimestamp(
                                email.utils.mktime_tz(parsed))
                # The last matching image wins; srcs mentioning "galge0" are
                # prefixed with the site base URL.
                image = None
                for img in content.findAll("img",
                        {"width" : 200, "border" : 1}):
                    image = img.attrMap["src"]
                    if image.rfind("galge0") > 0:
                        image = baseurl + image
                # Rows are stored as UTF-8 byte strings.
                if isinstance(title, unicode):
                    title = title.encode("utf-8")
                if isinstance(subtitle, unicode):
                    subtitle = subtitle.encode("utf-8")
                if isinstance(summary, unicode):
                    summary = summary.encode("utf-8")
                print(title)
                print(subtitle)
                print(url)
                EntryList(title=title, subtitle=subtitle, url=url,
                    datatype="audio/mpeg", summary=summary, date=date,
                    image=image)
                print(i)
            now_no = c * 20 + 1
        trans.commit()
    except Exception:
        trans.rollback()
        raise


def inportBackNumber01():
    """Scrape episodes 1-20 from ``back01.html`` into EntryList.

    For each episode the element following the ``<a name="NN">`` anchor is
    used as the episode container; title, subtitle, summary, mp3 URL, artwork
    URL and the mp3's ``Last-Modified`` time are extracted and stored as one
    ``EntryList`` row.

    Raises ValueError when an episode container has no ``.mp3`` link.
    """
    page_url = "http://www.galge.com/radio/galge/back01.html"
    html = urllib2.urlopen(page_url)
    print("Loading...")
    try:
        bs = BS(html.read())
    finally:
        html.close()
    print("Adding...")
    pat = re.compile(r".*\.mp3$")
    for i in range(1,21):
        content = bs.findAll("a", {"name" : "%02d"%i})[0].findNext()
        title = content.h4.contents[0]
        try:
            subtitle = content.b.next.next.next
        except AttributeError:
            subtitle = None
        # Only a text node counts as a subtitle; a tag means none.
        if not isinstance(subtitle,basestring):
            subtitle = None
        summary = content.h4.next.next
        summary = summary + "\n" +content.h4.next.next.next.next
        # Pick the first link that points at an .mp3 file; anchors without an
        # href (e.g. named anchors) are skipped.
        url = None
        for atag in content.findAll("a"):
            href = atag.get("href")
            if href and pat.match(href):
                url = href
                break
        if url is None:
            raise ValueError("no .mp3 link found for episode %d" % i)
        resp = urllib2.urlopen(url)
        try:
            date_src = resp.headers.getheader('Last-Modified')
        finally:
            resp.close()
        # Last-Modified carries its own zone (GMT); honour it instead of
        # reading the fields as local time.
        date = None
        if date_src:
            parsed = email.utils.parsedate_tz(date_src)
            if parsed:
                date = datetime.datetime.fromtimestamp(
                        email.utils.mktime_tz(parsed))
        # Artwork: a 200px-wide image inside the container, else one found
        # from the container's parent.
        img = content.find("img",{"width":200})
        if img is None:
            img = content.parent.find("img",{"width":200})
        if img is None:
            image = None
        else:
            image = img["src"]
        # Rows are stored as UTF-8 byte strings.
        if isinstance(title, unicode):
            title = title.encode("utf-8")
        if isinstance(subtitle, unicode):
            subtitle = subtitle.encode("utf-8")
        if isinstance(summary, unicode):
            summary = summary.encode("utf-8")
        print(title)
        print(url)
        EntryList(title=title, subtitle=subtitle, url=url,
            datatype="audio/mpeg", summary=summary, date=date, image=image)
        print(i)

if __name__ == "__main__":
    # Scraping is disabled by default; uncomment the next line to import
    # episodes 104-110 before regenerating the feed.
    #importBackNumber(104, 110)
    # Regenerate the RSS file from whatever is already in the database.
    toRSS(rss_path)