青空文庫XHTMLファイルをライブラリを使って整形する
パッケージのインストール
axhtml2xml.py
import re
import sys
import glob
from bs4 import BeautifulSoup
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom.minidom import parseString
import json
# Load the gaiji lookup table once, when the module is first executed.
# The path is relative, so the JSON file is resolved against the current
# working directory.  replace_gaiji_tags_simple() looks codes such as
# "2-01-79" up in this mapping.
with open("gaiji_map.json", "r", encoding="utf-8") as gaiji_file:
    GAIJI_MAP = json.load(gaiji_file)
def replace_ruby_tags(soup):
    """Replace each <ruby> element with a temporary ``|base《reading》`` marker.

    The marker is plain text, so it survives the later text extraction and
    can be turned back into markup afterwards.

    Args:
        soup: Parsed document (BeautifulSoup object); modified in place.

    Returns:
        The same ``soup`` object, for call chaining.
    """
    for ruby in soup.find_all("ruby"):
        rb_tag = ruby.find("rb")
        rt_tag = ruby.find("rt")
        reading = rt_tag.text if rt_tag is not None else ""

        if rb_tag is not None:
            base = rb_tag.text
        else:
            # No <rb> wrapper (e.g. <ruby>漢字<rt>かんじ</rt></ruby>): the base
            # text sits directly inside <ruby>.  Collect everything except
            # the reading (<rt>) and the fallback parentheses (<rp>) so the
            # base characters are not lost.
            base = "".join(
                child if isinstance(child, str) else child.get_text()
                for child in ruby.children
                if getattr(child, "name", None) not in ("rt", "rp")
            )

        if reading:
            ruby.replace_with(f"|{base}《{reading}》")
        else:
            # Without a reading there is nothing to annotate; an empty
            # "《》" marker could not be matched again later and would leak
            # into the output, so keep just the base text.
            ruby.replace_with(base)
    return soup
def process_body(soup):
    """Build the output XML tree from the <body> of a parsed document.

    The body text is cut after every "。" or "」".  Each non-empty piece
    becomes an <s> element.  A piece that starts with "「" opens a <speech>
    element (unless one is already open); pieces are added to the open
    <speech> until one ends with "」", which closes it.  Pieces outside a
    speech are attached directly to the root.

    Three kinds of <div> are removed from the body before the text is
    extracted: ``id="card"`` is discarded, while the text of
    ``class="bibliographical_information"`` and ``class="notation_notes"``
    is kept and appended to the root as elements carrying a ``text``
    attribute.

    Args:
        soup: Parsed document (BeautifulSoup object); its body is modified.

    Returns:
        The root <text> element of the new tree.

    Raises:
        ValueError: If the document has no <body>.
    """
    body = soup.body
    if not body:
        raise ValueError("No <body> tag found in the input file.")

    # Pull the special <div> blocks out of the body.
    biblio_text = None
    notes_text = None
    for div in body.find_all("div"):
        if div.get("id") == "card":
            div.decompose()
            continue
        css_classes = div.get("class") or ()
        if "bibliographical_information" in css_classes:
            biblio_text = div.get_text(strip=True)
            div.decompose()
        elif "notation_notes" in css_classes:
            notes_text = div.get_text(strip=True)
            div.decompose()

    root = Element("text")
    open_speech = None  # the <speech> element currently being filled, if any
    remaining_text = body.get_text("\n", strip=True)
    for chunk in re.split(r"(?<=。|」)", remaining_text):
        sentence = chunk.strip()
        if not sentence:
            continue

        # An opening bracket starts a new speech unless one is still open.
        if open_speech is None and sentence.startswith("「"):
            open_speech = SubElement(root, "speech")

        parent = root if open_speech is None else open_speech
        SubElement(parent, "s").text = sentence

        # A closing bracket at the end of the sentence ends the open speech.
        if open_speech is not None and sentence.endswith(("。」", "」")):
            open_speech = None

    # Saved metadata goes to the end of the document.
    trailing_blocks = (
        ("bibliographical_information", biblio_text),
        ("notation_notes", notes_text),
    )
    for tag_name, value in trailing_blocks:
        if value:
            SubElement(root, tag_name).set("text", value)

    return root
def restore_ruby_tags(root):
    """Turn temporary ``|base《reading》`` markers back into ``<r>`` markup.

    Every element's ``text`` is rewritten in place: each marker becomes
    ``<r rt="reading">base</r>`` and all newline characters are removed.
    Only ``text`` is processed; attributes and ``tail`` are left untouched.

    Args:
        root: Root element of the tree to process (modified in place).
    """
    # The bar must be matched literally.  A bare "|" in a regular expression
    # is the alternation operator, which made the pattern match the empty
    # string at every position and inject an empty <r> tag between all
    # characters.  The full-width bar (U+FF5C) is accepted as well.
    marker = re.compile(r"[|\uFF5C](.+?)《(.+?)》")
    for element in root.iter():
        if element.text:
            restored = marker.sub(r'<r rt="\2">\1</r>', element.text)
            # Drop the newlines left over from text extraction.
            element.text = re.sub(r"\n+", "", restored)
def replace_gaiji_tags_simple(soup):
    """Replace ``<img class="gaiji">`` tags with the characters they stand for.

    A code of the form ``N-NN-NN`` (e.g. "2-01-79") is extracted from the
    image's ``src`` and looked up in ``GAIJI_MAP``.  Images whose ``src``
    contains no such code, or whose code has no (non-empty) entry in the
    map, are left unchanged.

    Args:
        soup: Parsed document (BeautifulSoup object); modified in place.

    Returns:
        The same ``soup`` object.
    """
    for img in soup.find_all("img", class_="gaiji"):
        code_match = re.search(r"(\d{1,2}-\d{2}-\d{2})", img.get("src", ""))
        if not code_match:
            continue  # no recognisable code in the image path

        replacement = GAIJI_MAP.get(code_match.group(1), "")
        if not replacement:
            continue  # unknown code: keep the <img> as it is

        img.replace_with(replacement)
    return soup
def fix_split_quotation_marks(xml_string):
    """Merge a lone closing mark back into the preceding ``<s>`` element.

    Sentence splitting can leave an ``<s>`` element that contains nothing
    but "。", "」" or ")".  Such an element is folded into the ``<s>`` that
    directly precedes it (only whitespace may separate the two), e.g.
    ``<s>「はい。</s> <s>」</s>`` becomes ``<s>「はい。」</s>``.

    Args:
        xml_string: Serialized XML text.

    Returns:
        The XML text with the split closing marks merged.
    """
    # The final "</s>" is only looked at, not consumed, so it can serve as
    # the start of the next match.  This lets a run of several one-character
    # elements (e.g. "」" followed by another "」") be merged completely
    # instead of only the first one.
    return re.sub(r"</s>\s*<s>([。」)])(?=</s>)", r"\1", xml_string)
def convert_file(input_file, output_file):
    """Convert one XHTML file into the custom XML format and write it out.

    Steps: parse the file, replace gaiji images and ruby tags with text,
    build the <text> tree, restore the ruby markup, pretty-print, and
    post-process the serialized text.

    Args:
        input_file: Path of the XHTML file (read as UTF-8).
        output_file: Accepted for interface compatibility but not used.
            The output name is always derived from ``input_file`` and the
            document title: ``<input without .html/.xhtml>_<title>.xml``.

    Returns:
        The path of the XML file that was written.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Document title.  get_text() also copes with an empty <title> or one
    # that contains nested markup, where ``.string`` would be None.
    title = "Unknown Title"
    if soup.title is not None:
        title_text = soup.title.get_text(strip=True)
        if title_text:
            title = title_text

    # The title becomes part of the file name, so path separators in it
    # must not be interpreted as directories.
    safe_title = re.sub(r"[\\/]", "_", title)
    base_output_file = re.sub(r"\.x?html?$", "", input_file)
    output_file = f"{base_output_file}_{safe_title}.xml"

    # Gaiji images -> characters, then ruby tags -> temporary text markers.
    soup = replace_gaiji_tags_simple(soup)
    soup = replace_ruby_tags(soup)

    # Build the XML tree and label it with the (unmodified) title.
    root = process_body(soup)
    root.attrib["sampleID"] = title

    # Temporary ruby markers -> <r rt="..."> markup inside the element text.
    restore_ruby_tags(root)

    xml_string = tostring(root, encoding="unicode")
    pretty_xml = parseString(xml_string).toprettyxml(indent=" ")

    # The <r> markup was inserted as element text, so the serializer escaped
    # it.  Undo that escaping so it appears as real tags in the output.
    # NOTE(review): this is a plain text replacement, so an escaped "<", ">"
    # or '"' that came from the source text is un-escaped as well.
    pretty_xml = (
        pretty_xml.replace("&lt;", "<").replace("&gt;", ">").replace("&quot;", '"')
    )

    # Re-attach closing marks that ended up in an <s> element of their own.
    fixed_xml = fix_split_quotation_marks(pretty_xml)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(fixed_xml)

    return output_file
# Command-line entry point: convert every file matched by the arguments.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python convert.py <input_xhtml_files>")
        sys.exit(1)

    # Use every argument, not just the first: when the shell expands a
    # wildcard itself, each matching file arrives as a separate argument.
    # Each argument is still passed through glob so that quoted patterns
    # keep working; dict.fromkeys() removes duplicates and keeps the order.
    patterns = sys.argv[1:]
    files = list(
        dict.fromkeys(path for pattern in patterns for path in glob.glob(pattern))
    )
    if not files:
        print(f"No files matched the pattern: {' '.join(patterns)}")
        sys.exit(1)

    for input_file in files:
        output_file = re.sub(r"\.x?html?$", ".xml", input_file)
        try:
            written = convert_file(input_file, output_file)
        except Exception as e:
            # Top-level boundary: report the failure and go on to the next file.
            print(f"Error processing {input_file}: {e}")
        else:
            # Prefer the path reported by convert_file when it returns one;
            # otherwise fall back to the default name computed above.
            print(f"Converted file saved to {written or output_file}")