授業資料/PythonでXML

青空文庫XHTMLファイルをライブラリを使って整形する†

パッケージのインストール†

lxml: XMLパーサ（parser）。XMLの構造を解釈して、プログラムから利用しやすい形にする。
```
sudo apt install python3-lxml
```
Beautiful Soup: HTMLやXMLファイルからデータを抽出・解析するライブラリ。
```
sudo apt install python3-bs4
```
axhtml2xml.py†

import re
import sys
import glob
from bs4 import BeautifulSoup
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom.minidom import parseString

import json

# Load the gaiji lookup table once at import time. It is queried with
# JIS position codes such as "2-01-79" and yields the replacement character.
# The path is relative, so it is resolved against the current directory.
with open("gaiji_map.json", mode="r", encoding="utf-8") as f:
    GAIJI_MAP = json.loads(f.read())

def replace_ruby_tags(soup):
    """Replace every <ruby> element with a temporary ｜base《reading》 string.

    The base text is taken from the first <rb> child and the reading from
    the first <rt> child; a missing child contributes an empty string.
    The document is modified in place and also returned.
    """
    for ruby_tag in soup.find_all("ruby"):
        base_tag = ruby_tag.find("rb")
        reading_tag = ruby_tag.find("rt")
        base_text = "" if base_tag is None else base_tag.text
        reading_text = "" if reading_tag is None else reading_tag.text
        ruby_tag.replace_with(f"｜{base_text}《{reading_text}》")
    return soup


def process_body(soup):
    """Split the body text into sentences and build the output XML tree.

    The <div id="card"> block is discarded. The text of the
    "bibliographical_information" and "notation_notes" divs is removed from
    the body and re-attached at the end of the tree as the ``text``
    attribute of an element of the same name. The remaining text is split
    after every "。" or "」"; each piece becomes an <s> element, and pieces
    belonging to a quotation opened by "「" are grouped under <speech>.

    Returns:
        The root <text> element.

    Raises:
        ValueError: If the document has no <body>.
    """
    body = soup.body
    if not body:
        raise ValueError("No <body> tag found in the input file.")

    root = Element("text")

    # Pull the metadata divs out of the body before extracting the text.
    biblio_text = None
    notes_text = None
    for div in body.find_all("div"):
        if div.get("id") == "card":
            # The card block is dropped entirely.
            div.decompose()
            continue

        css_classes = div.get("class")
        if not css_classes:
            continue

        if "bibliographical_information" in css_classes:
            biblio_text = div.get_text(strip=True)
            div.decompose()
        elif "notation_notes" in css_classes:
            notes_text = div.get_text(strip=True)
            div.decompose()

    # Everything that is left is running text.
    text_content = body.get_text("\n", strip=True)

    # Zero-width split: the delimiter stays attached to the preceding piece.
    open_speech = None
    for piece in re.split(r"(?<=。|」)", text_content):
        piece = piece.strip()
        if not piece:
            continue

        # An opening bracket starts a new <speech> unless one is still open.
        if open_speech is None and piece.startswith("「"):
            open_speech = SubElement(root, "speech")

        parent = root if open_speech is None else open_speech
        SubElement(parent, "s").text = piece

        # A closing bracket ends the speech that is currently open.
        if open_speech is not None and piece.endswith("」"):
            open_speech = None

    # Append the saved metadata at the end of the document.
    trailing_metadata = (
        ("bibliographical_information", biblio_text),
        ("notation_notes", notes_text),
    )
    for tag_name, captured in trailing_metadata:
        if captured:
            SubElement(root, tag_name).set("text", captured)

    return root


def restore_ruby_tags(root):
    """Turn the temporary ｜base《reading》 notation into <r rt="..."> markup.

    Only the ``text`` of each element under *root* (including *root*) is
    rewritten, in place. After the ruby markup has been restored, every
    newline is removed from that text. Nothing is returned.
    """
    ruby_placeholder = re.compile(r"｜(.+?)《(.+?)》")
    line_breaks = re.compile(r"\n+")

    for node in root.iter():
        raw_text = node.text
        if not raw_text:
            continue
        # Restore ruby first, then strip newlines (the order matters because
        # "." in the placeholder pattern does not match a newline).
        with_ruby = ruby_placeholder.sub(r'<r rt="\2">\1</r>', raw_text)
        node.text = line_breaks.sub("", with_ruby)



def replace_gaiji_tags_simple(soup):
    """Replace <img class="gaiji"> tags with the character they stand for.

    The JIS position code (for example "2-01-79") is extracted from the
    image's ``src`` and looked up in ``GAIJI_MAP``. Images whose ``src`` has
    no such code, or whose code has no (non-empty) mapping, are left
    untouched. The document is modified in place and also returned.
    """
    for gaiji_img in soup.find_all("img", class_="gaiji"):
        code_match = re.search(r"(\d{1,2}-\d{2}-\d{2})", gaiji_img.get("src", ""))
        if not code_match:
            continue

        replacement = GAIJI_MAP.get(code_match.group(1), "")
        if not replacement:
            continue

        gaiji_img.replace_with(replacement)
    return soup


def fix_split_quotation_marks(xml_string):
    """Re-attach a lone "。", "」" or "）" to the sentence before it.

    An <s> element that contains only one of those characters and directly
    follows a closing </s> (optionally separated by whitespace) is removed,
    and its character is appended to the preceding <s>.

    Returns:
        The corrected XML string.
    """
    orphan_mark = re.compile(r"</s>\s*<s>([。」）])</s>")
    return orphan_mark.sub(lambda found: found.group(1) + "</s>", xml_string)


def convert_file(input_file, output_file):
    """Convert one XHTML file into the custom XML format and write it out.

    Gaiji images and ruby are normalised, the body is split into sentences,
    the tree is pretty-printed and the result is written as UTF-8.

    Args:
        input_file: Path of the XHTML file to read (opened as UTF-8).
        output_file: Accepted for compatibility with existing callers but
            not used: the file written is always named
            ``<input_file without its .html/.xhtml/.htm suffix>_<title>.xml``.

    Returns:
        The path of the XML file that was written.

    Raises:
        ValueError: If the document has no <body> (raised by process_body).
    """
    with open(input_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # <title> may be absent, and .string is None for an empty title or one
    # that contains nested markup; neither case should abort the conversion.
    title_text = soup.title.string if soup.title else None
    title = title_text.strip() if title_text is not None else "Unknown Title"

    # The title becomes part of the file name, so path separators in it must
    # not be allowed to redirect the output into another directory.
    base_output_file = re.sub(r"\.x?html?$", "", input_file)
    safe_title = re.sub(r"[/\\]", "_", title)
    output_file = f"{base_output_file}_{safe_title}.xml"

    # Replace <img class="gaiji"> tags with characters.
    soup = replace_gaiji_tags_simple(soup)

    # Temporarily flatten ruby so it survives plain-text extraction.
    soup = replace_ruby_tags(soup)

    # Build the XML tree from the body.
    root = process_body(soup)

    # Record the title on the root element.
    root.attrib["sampleID"] = title

    # Turn the temporary ruby notation into <r rt="..."> markup (as text).
    restore_ruby_tags(root)

    # Serialise and pretty-print.
    xml_string = tostring(root, encoding="unicode")
    pretty_xml = parseString(xml_string).toprettyxml(indent="  ")

    # The ruby markup was inserted as text, so the serialiser escaped it.
    # Un-escape only that markup: un-escaping every &lt;/&gt;/&quot; would
    # also hit genuine text and attribute values and yield malformed XML.
    # Depending on the Python version the quotes around the rt value are
    # emitted either raw or as &quot;, so both forms are accepted.
    pretty_xml = re.sub(
        r'&lt;r rt=(?:&quot;|")(.*?)(?:&quot;|")&gt;(.*?)&lt;/r&gt;',
        r'<r rt="\1">\2</r>',
        pretty_xml,
    )

    # Re-attach closing marks that were split into their own <s>.
    fixed_xml = fix_split_quotation_marks(pretty_xml)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(fixed_xml)

    return output_file


if __name__ == "__main__":
    # Command-line entry point: convert every file matched by the arguments.
    if len(sys.argv) < 2:
        print("Usage: python convert.py <input_xhtml_files>")
        sys.exit(1)

    # Use every argument, not only the first one: an unquoted wildcard is
    # expanded by the shell into several arguments before Python sees it.
    # Each argument is still globbed so quoted patterns keep working.
    patterns = sys.argv[1:]
    files = []
    seen = set()
    for file_pattern in patterns:
        for matched in glob.glob(file_pattern):
            if matched not in seen:
                seen.add(matched)
                files.append(matched)

    if not files:
        print(f"No files matched the pattern: {' '.join(patterns)}")
        sys.exit(1)

    for input_file in files:
        output_file = re.sub(r"\.x?html?$", ".xml", input_file)
        try:
            # Report the path returned by convert_file when it provides one;
            # otherwise fall back to the name requested here.
            saved_path = convert_file(input_file, output_file) or output_file
            print(f"Converted file saved to {saved_path}")
        except Exception as e:
            # Top-level boundary: report the failure and continue with the
            # remaining files.
            print(f"Error processing {input_file}: {e}")
OGISO.NET
自己紹介
OGISO.NET mail
Private
授業資料/PythonでXML

青空文庫XHTMLファイルをライブラリを使って整形する†

パッケージのインストール†

axhtml2xml.py†

最新の10件