마크다운 문서를 HTML로 변환하는 파이썬 코드(Feat.GPT-5)

GPT-5가 출시되고 나서 많은 사람들이 실망했다. 생각보다 hallucination 현상이 줄어들지 않았고 오히려 어떤 부분은 GPT-4o보다 성능이 떨어지는 느낌이 있었다고 한다.

개인적으로는 한글-영어 번역에서 GPT-5 성능이 GPT-4o보다 떨어진다고 느꼈다.

또한, ChatGPT는 한 번에 다룰 수 있는 토큰 수(컨텍스트 길이)가 너무 적다. Google AI Studio를 사용해 본 사람은 토큰 한도가 크면 얼마나 좋은지 경험해 봤을 것이다.

하지만 GPT-5에게 파이썬 코딩을 요청했더니 결과가 놀라웠다.

테이블, 리스트, 수식 등을 포함한 마크다운 문서를 HTML로 변환하는 코드를 이전 버전에 요청했을 때에는 계속 이상한 코드를 내놓아서 매번 피드백을 해야 했다. 그 피드백마저 무한 루프에 빠졌기 때문에 Perplexity와 Grok을 함께 사용해야만 했다.

그런데 GPT-5는 확실히 다르다. Thinking 모드가 아님에도 불구하고 파이썬 코드를 한 번에 완벽하게 작성해 주었다. (일반 모델로 실행해도 Thinking 모드로 전환되어 응답하기도 한다. 이때는 Thinking 모드 사용 횟수가 차감되지 않는다.)

뉴스에서 미국 개발자들이 대량 해고된다는 소식을 자주 접했는데, 이유를 약간이나마 알 수 있었다.

어쨌든 다음은 GPT-5가 만들어준 Markdown to HTML 파이썬 코드이다. 구글 코랩에서 실행한다.

# ⬇️ 코랩에서 이 셀만 실행: Markdown → "본문만" HTML (헤더/스타일/스크립트 없음)
!pip -q install markdown pymdown-extensions ipywidgets mdx_truly_sane_lists beautifulsoup4

import re, json
import markdown as md
import ipywidgets as widgets
from bs4 import BeautifulSoup, NavigableString
from IPython.display import display, Javascript

# --- UI ---
# Input area where the raw Markdown text is pasted.
ta_md = widgets.Textarea(
    description="Markdown",
    placeholder="여기에 Markdown 원문을 붙여넣으세요.",
    style={"description_width": "80px"},
    layout=widgets.Layout(width="100%", height="300px"),
)

# Conversion options (read by the button handlers).
cb_toc = widgets.Checkbox(
    description="[TOC] 자동 삽입",
    value=False,
)
cb_math = widgets.Checkbox(
    description="수식 래핑(arithmatex)만 적용 (추가 스크립트 없음)",
    value=True,
)
cb_force = widgets.Checkbox(
    description="하위 목록 마커 강제(인라인 스타일)",
    value=True,
)

# Action buttons.
btn_make = widgets.Button(
    button_style="primary",
    description="본문 HTML 생성",
)
btn_copy = widgets.Button(
    tooltip="HTML 소스를 클립보드로 복사",
    description="복사",
)

# Output area that receives the generated HTML, plus a status line.
ta_html = widgets.Textarea(
    description="HTML",
    style={"description_width": "80px"},
    layout=widgets.Layout(width="100%", height="320px"),
)
msg = widgets.HTML(value="")

# Stack everything vertically: input, option row, button row, output, status.
display(
    widgets.VBox(
        [
            ta_md,
            widgets.HBox([cb_toc, cb_math, cb_force]),
            widgets.HBox([btn_make, btn_copy]),
            ta_html,
            msg,
        ]
    )
)

# --- 유틸 ---
_item_head = re.compile(r'^[ ]{0,3}(?:[*+-]\s|\d+\.\s)')
_nested_item_head = re.compile(r'^[ ]{4,}(?:[*+-]\s|\d+\.\s)')

def _is_item(line:str) -> bool:
    return _item_head.match(line) is not None

# --- 정규화 ---
def normalize_md(s: str) -> str:
    # 줄바꿈/탭
    t = s.replace("\r\n", "\n").replace("\r", "\n")
    t = t.replace("\t", "    ")  # 탭 → 4칸

    # 비가시 문자 제거(제로폭·BOM 등) + NBSP류 → 공백
    for ch in ("\u200B", "\uFEFF"):  # zero-width space, BOM
        t = t.replace(ch, "")
    for ch in ("\u00A0","\u202F","\u2007","\u2060"):
        t = t.replace(ch, " ")

    # 특수 글머리(• · ㆍ) → '- ' (들여쓰기 보존)
    t = re.sub(r"^([ ]*)([•·ㆍ])\s+", r"\1- ", t, flags=re.M)

    # unordered/ordered list 마커 뒤 공백 정규화 (들여쓰기 보존)
    t = re.sub(r"^([ ]*)([*+-])(?:\s|[\u00A0\u202F\u2007])+", r"\1\2 ", t, flags=re.M)
    t = re.sub(r"^([ ]*)(\d+)[\.\)](?:\s|[\u00A0\u202F\u2007])+", r"\1\2. ", t, flags=re.M)

    # 문단 → "첫" 리스트 시작에서만 한 줄 삽입 (항목 사이엔 X)
    out = []
    prev_blank = True
    prev_list  = False
    lines = t.splitlines(keepends=True)
    for line in lines:
        is_blank = (line.strip() == "")
        is_list  = _is_item(line)
        if is_list and (not prev_blank) and (not prev_list):
            if out and out[-1].endswith("\n"):
                out.append("\n")
        out.append(line)
        prev_blank = is_blank
        prev_list  = is_list
    t = "".join(out)

    # 리스트 아이템 다음 줄이 곧장 하위 리스트이면 연결을 위해 빈 줄 1줄 삽입
    lines = t.splitlines(keepends=True)
    out = []
    for i, line in enumerate(lines):
        out.append(line)
        if i + 1 < len(lines):
            nxt = lines[i + 1]
            if _is_item(line) and (nxt.strip() != "") and _nested_item_head.match(nxt):
                out.append("\n")
    t = "".join(out)

    # 하위 목록 들여쓰기: 4의 배수로 반올림(최소 4칸)
    def _fix_indent(m):
        n = len(m.group(1))
        return " " * max(4, ((n + 3)//4)*4) + m.group(2)
    t = re.sub(r"^([ ]+)([*+-]\s|\d+\.\s)", _fix_indent, t, flags=re.M)

    return t

# --- HTML post-processing: <li><p>* ...\n* ...</p></li> → <li><ul/ol><li>...</li>...</ul/ol></li>
# Leading bullet marker (*, + or -) with optional whitespace before it and
# at least one whitespace character after it.
_bullet_pat  = re.compile(r'^\s*([*+-])\s+')
# Leading ordered marker: digits followed by "." or ")", then whitespace.
_ordered_pat = re.compile(r'^\s*(\d+)[\.\)]\s+')

def _split_p_into_lines(p):
    """Break the children of a <p> into per-line lists of nodes.

    Text nodes are split on '\\n' and each piece is wrapped in a fresh
    NavigableString; every newline starts a new line list. Tag children are
    placed into the current line by reference (they are not detached from
    *p* here). Returns a list of lists, always containing at least one line.
    """
    rows = [[]]
    for child in list(p.contents):  # iterate over a snapshot of the children
        if not isinstance(child, NavigableString):
            rows[-1].append(child)  # tags go into the current line as-is
            continue
        first, *rest = str(child).split('\n')
        rows[-1].append(NavigableString(first))
        for piece in rest:
            # each newline in the text opens a new line
            rows.append([NavigableString(piece)])
    return rows

def _extract_marker_from_line(line_nodes):
    """Strip a leading list marker from one line and report its kind.

    Looks at the first node that is not an empty text node. If that node is
    text starting with a bullet or ordered marker, the entry in *line_nodes*
    is replaced by a new text node without the marker. Only the list is
    modified; the DOM is not touched.

    Returns:
        ("ul", True) or ("ol", True) when a marker was removed,
        (None, False) otherwise.
    """
    # Index of the first node that is not an empty text node (None if absent).
    first = next(
        (
            k
            for k, node in enumerate(line_nodes)
            if not (isinstance(node, NavigableString) and str(node) == "")
        ),
        None,
    )
    if first is None or not isinstance(line_nodes[first], NavigableString):
        return None, False

    text = str(line_nodes[first])
    # Bullet markers are tried before ordered markers.
    for pattern, kind in ((_bullet_pat, "ul"), (_ordered_pat, "ol")):
        hit = pattern.match(text)
        if hit:
            line_nodes[first] = NavigableString(text[hit.end():])
            return kind, True
    return None, False

def _patch_paragraph_bullets(html:str):
    """Turn <p> elements that are really lists into <ul>/<ol> elements.

    For every <p> that is a direct child of an <li>, the paragraph is split
    into lines. If every line starts with a list marker, the <p> is replaced
    by a list whose items are those lines (markers removed). A single
    marked line is enough. The list is <ol> only when all lines used ordered
    markers; otherwise it is <ul>.

    Returns:
        (html, changed): the serialized document and whether any <p> was
        replaced.
    """
    soup = BeautifulSoup(html, "html.parser")
    changed = False

    for item in soup.find_all("li"):
        # only <p> elements directly under this <li>
        for para in list(item.find_all("p", recursive=False)):
            kinds, rows = [], []
            for row in _split_p_into_lines(para):
                kind, matched = _extract_marker_from_line(row)
                if not matched:
                    # one line without a marker cancels the whole conversion
                    kinds, rows = [], []
                    break
                kinds.append(kind)
                rows.append(row)

            if not rows:
                continue

            tag_name = "ul" if ("ul" in kinds or "ol" not in kinds) else "ol"
            sublist = soup.new_tag(tag_name)
            for row in rows:
                entry = soup.new_tag("li")
                for node in row:
                    entry.append(node)
                sublist.append(entry)
            para.replace_with(sublist)
            changed = True

    return str(soup), changed

def _merge_adjacent_lists(html:str):
    """Merge sibling lists of the same type (<ul>+<ul> or <ol>+<ol>).

    Within each tag of the parsed document, a list followed by another list
    with the same tag name (only whitespace text in between) absorbs the
    direct <li> children of the later list, and the later list is removed.
    Returns the serialized document.
    """
    soup = BeautifulSoup(html, "html.parser")
    for container in soup.find_all(True):
        pos = 0
        while pos < len(container.contents):
            head = container.contents[pos]
            if getattr(head, "name", None) not in ("ul", "ol"):
                pos += 1
                continue

            scan = pos + 1
            while scan < len(container.contents):
                sibling = container.contents[scan]
                if isinstance(sibling, NavigableString):
                    if str(sibling).strip() != "":
                        break  # real text ends the run of mergeable lists
                    scan += 1  # whitespace-only text is skipped
                    continue
                if getattr(sibling, "name", None) != head.name:
                    break
                # move only the direct <li> children into the first list
                for entry in list(sibling.find_all("li", recursive=False)):
                    head.append(entry)
                # removing the sibling shifts the next node into `scan`,
                # so the index is intentionally left unchanged
                sibling.decompose()
            pos = scan
    return str(soup)

def _apply_marker_styles(html:str):
    """Force bullet shapes on nested <ul> elements via inline styles.

    The nesting level of a <ul> is the number of its <ul>/<ol> ancestors.
    Level 1 gets ``list-style-type: circle``, deeper levels get ``square``;
    top-level lists are left alone. The declaration is appended to any
    existing ``style`` attribute. Returns the serialized document.
    """
    soup = BeautifulSoup(html, "html.parser")

    for ul in soup.find_all("ul"):
        nesting = sum(
            1 for ancestor in ul.parents
            if getattr(ancestor, "name", None) in ("ul", "ol")
        )
        if nesting < 1:
            continue
        shape = "square" if nesting != 1 else "circle"
        existing = ul.get("style", "")
        separator = "; " if existing else ""
        ul["style"] = existing + separator + f"list-style-type: {shape}"
    return str(soup)

# --- Conversion ---
def build_fragment(md_text:str, use_math:bool, force_markers:bool) -> str:
    """Convert Markdown text into an HTML fragment.

    Args:
        md_text: Raw Markdown source.
        use_math: When true, the ``pymdownx.arithmatex`` extension is enabled
            (configured with ``generic: True``).
        force_markers: When true, nested <ul> elements get inline
            list-style-type declarations.

    Returns:
        The HTML produced by python-markdown after the list post-processing
        steps below.
    """
    math_exts = ["pymdownx.arithmatex"] if use_math else []
    ext_configs = {"pymdownx.arithmatex": {"generic": True}}
    source = normalize_md(md_text)

    try:
        fragment = md.markdown(
            source,
            extensions=["extra", "toc", "mdx_truly_sane_lists"] + math_exts,
            extension_configs=ext_configs,
        )
    except Exception:
        # Best-effort fallback: retry with the built-in sane_lists extension.
        fragment = md.markdown(
            source,
            extensions=["extra", "toc", "sane_lists"] + math_exts,
            extension_configs=ext_configs,
        )

    # Turn '* ' / '1. ' lines left inside <p> into real nested lists
    # (a single line is enough).
    fragment, _changed = _patch_paragraph_bullets(fragment)

    # Merge adjacent lists of the same type (<ul>…</ul><ul>…</ul> → one list).
    fragment = _merge_adjacent_lists(fragment)

    # Optionally force the marker shape of nested lists.
    return _apply_marker_styles(fragment) if force_markers else fragment

# --- Handlers ---
def on_make(_):
    """Click handler: convert the Markdown input and show the HTML.

    Clears the status line, warns and stops if the input is empty after
    stripping, optionally prepends a ``[TOC]`` marker, then writes the
    generated HTML to the output area and reports success.
    """
    msg.value = ""
    source = ta_md.value.strip()
    if source == "":
        msg.value = "<span style='color:#d00'>⚠️ 변환할 Markdown이 비어 있습니다.</span>"
        return

    # Add the TOC marker only when requested and not already present.
    if cb_toc.value and "[TOC]" not in source:
        source = f"[TOC]\n\n{source}"

    ta_html.value = build_fragment(
        source,
        use_math=cb_math.value,
        force_markers=cb_force.value,
    )
    msg.value = "<span style='color:#0a0'>✅ 본문 HTML을 생성했습니다. (헤더/스타일/스크립트 없음)</span>"

def on_copy(_):
    """Click handler: ask the browser to copy the generated HTML.

    The HTML is serialized with json.dumps so it can be embedded as a
    JavaScript string literal. The status line is set right after the
    script is displayed; the result of the clipboard call is not checked.
    """
    js_literal = json.dumps(ta_html.value)
    script = Javascript(f"navigator.clipboard.writeText({js_literal})")
    display(script)
    msg.value = "<span style='color:#0a0'>📋 복사 완료!</span>"

# Wire each button to its click handler.
btn_make.on_click(on_make)
btn_copy.on_click(on_copy)

블로그 글 전체 목록 바로가기

마크다운 문서를 HTML로 변환하는 파이썬 코드(Feat.GPT-5)

블로그 소개

카테고리

태그 모아보기

블로그 글 전체 목록 바로가기

마크다운 문서를 HTML로 변환하는 파이썬 코드(Feat.GPT-5)

관련 글

블로그 소개

카테고리