#!/usr/bin/awk -f
#
# by: Jesus Galan (yiyus) 2009
#
# Usage: md2html.awk file.md > file.html
# See: http://4l77.com/src/md2html.awk
#
# Markdown to HTML filter.  Input is processed line by line: block level
# structure (quotes, lists, code, headers, raw html) is tracked by the
# pattern rules below, and the text of each paragraph is accumulated in
# `text` and converted by inline() when the paragraph is flushed by printp().
#
# Global state:
#   text          pending paragraph text, not yet printed
#   par           tag used to wrap the pending paragraph ("p", "h1".., or "")
#   otext         buffered output, held back while references are unresolved
#   nr            number of unresolved "<<id" reference placeholders in otext
#   ref[id]       url (plus optional title attribute) of a defined reference
#   block[1..nl]  currently open block elements ("ul", "ol", "blockquote")
#   nblock[1..nnl] block nesting computed for the current input line
#   code, html, hr, blank, newli   per-line state flags
#   ilcode, ilcode2, ns, stag[]    inline state, reset by inline()

# Escape "&" and "<" so that t can be emitted as literal HTML text.
# ("\\&" yields a literal ampersand in a gsub replacement.)
function eschtml(t) {
	gsub("&", "\\&amp;", t);
	gsub("<", "\\&lt;", t);
	return t;
}

# Print t immediately, or append it to the output buffer while there are
# unresolved reference placeholders.
function oprint(t) {
	if(nr == 0)
		print t;
	else
		otext = otext "\n" t;
}

# Replace pending placeholders for reference `id` in the buffered output
# with ref[id], then flush the buffer once nothing is left unresolved.
# Placeholders are always emitted as `<<id` immediately followed by a double
# quote, so the search is done literally with index(): this avoids treating
# id as a regular expression, matching a prefix of a longer id, and the
# special meaning of "&" in sub() replacements (urls contain "&amp;").
function subref(id,    key, pos) {
	key = "<<" id "\"";
	while(nr > 0 && (pos = index(otext, key)) > 0) {
		otext = substr(otext, 1, pos - 1) ref[id] \
			substr(otext, pos + length(key) - 1);
		nr--;
	}
	if(nr == 0 && otext) {
		print otext;
		otext = "";
	}
}

# Convert the inline markup of t to HTML.  Finds the next special character,
# handles it and recurses on the remainder of the string.  The extra
# parameters are locals, so that recursive calls cannot clobber the values
# of the caller; ilcode, ilcode2, ns, stag, ref and nr are shared globals.
function nextil(t,    t1, tag, t2, url, linktext, alt, title, id, r, pt2, ntag, n) {
	if(!match(t, /[`<&\[*_\\-]|(\!\[)/))
		return t;
	t1 = substr(t, 1, RSTART - 1);
	tag = substr(t, RSTART, RLENGTH);
	t2 = substr(t, RSTART + RLENGTH);
	# Inside inline code everything but a backtick is literal text
	if(ilcode && tag != "`")
		return eschtml(t1 tag) nextil(t2);
	# Backslash escaping
	if(tag == "\\"){
		if(match(t2, /^[\\`*_{}\[\]()#+\-\.!]/)){
			tag = substr(t2, 1, 1);
			t2 = substr(t2, 2);
		}
		return t1 tag nextil(t2);
	}
	# Dashes
	if(tag == "-"){
		if(sub(/^-/, "", t2))
			tag = "&#8212;";
		return t1 tag nextil(t2);
	}
	# Inline Code
	if(tag == "`"){
		if(sub(/^`/, "", t2)){
			# A double backtick with no closing pair is printed as a quote
			if(!match(t2, /``/))
				return t1 "&#8221;" nextil(t2);
			ilcode2 = !ilcode2;
		}
		else if(ilcode2)
			return t1 tag nextil(t2);
		tag = "<code>";
		if(ilcode){
			t1 = eschtml(t1);
			tag = "</code>";
		}
		ilcode = !ilcode;
		return t1 tag nextil(t2);
	}
	if(tag == "<"){
		# Autolinks
		if(match(t2, /^[^ \t]+[\.@][^ \t]+>/)){
			url = eschtml(substr(t2, 1, RLENGTH - 1));
			t2 = substr(t2, RLENGTH + 1);
			linktext = url;
			if(match(url, /@/) && !match(url, /^mailto:/))
				url = "mailto:" url;
			return t1 "<a href=\"" url "\">" linktext "</a>" nextil(t2);
		}
		# Html tags
		if(match(t2, /^[A-Za-z\/!][^>]*>/)){
			tag = tag substr(t2, RSTART, RLENGTH);
			t2 = substr(t2, RLENGTH + 1);
			return t1 tag nextil(t2);
		}
		return t1 "&lt;" nextil(t2);
	}
	# Html special entities
	if(tag == "&"){
		if(match(t2, /^#?[A-Za-z0-9]+;/)){
			tag = tag substr(t2, RSTART, RLENGTH);
			t2 = substr(t2, RLENGTH + 1);
			return t1 tag nextil(t2);
		}
		return t1 "&amp;" nextil(t2);
	}
	# Images
	if(tag == "!["){
		if(!match(t2, /(\[.*\])|(\(.*\))/))
			return t1 tag nextil(t2);
		match(t2, /^[^\]]*/);
		alt = substr(t2, 1, RLENGTH);
		t2 = substr(t2, RLENGTH + 2);
		if(match(t2, /^\(/)){
			# Inline
			sub(/^\(/, "", t2);
			match(t2, /^[^\)]+/);
			url = eschtml(substr(t2, 1, RLENGTH));
			t2 = substr(t2, RLENGTH + 2);
			title = "";
			if(match(url, /[ \t]+\".*\"[ \t]*$/)) {
				title = substr(url, RSTART, RLENGTH);
				url = substr(url, 1, RSTART - 1);
				match(title, /\".*\"/);
				title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
			}
			if(match(url, /^<.*>$/))
				url = substr(url, 2, RLENGTH - 2);
			return t1 "<img src=\"" url "\" alt=\"" alt "\"" title " />" nextil(t2);
		}
		else{
			# Referenced
			sub(/^ ?\[/, "", t2);
			id = alt;
			if(match(t2, /^[^\]]+/))
				id = substr(t2, 1, RLENGTH);
			t2 = substr(t2, RLENGTH + 2);
			if(ref[id])
				r = ref[id];
			else{
				# Not defined yet: leave a placeholder for subref()
				r = "<<" id;
				nr++;
			}
			return t1 "<img src=\"" r "\" alt=\"" alt "\" />" nextil(t2);
		}
	}
	# Links
	if(tag == "["){
		if(!match(t2, /(\[.*\])|(\(.*\))/))
			return t1 tag nextil(t2);
		match(t2, /^[^\]]*(\[[^\]]*\][^\]]*)*/);
		linktext = substr(t2, 1, RLENGTH);
		t2 = substr(t2, RLENGTH + 2);
		if(match(t2, /^\(/)){
			# Inline
			match(t2, /^[^\)]+(\([^\)]+\)[^\)]*)*/);
			url = substr(t2, 2, RLENGTH - 1);
			pt2 = substr(t2, RLENGTH + 2);
			title = "";
			if(match(url, /[ \t]+\".*\"[ \t]*$/)) {
				title = substr(url, RSTART, RLENGTH);
				url = substr(url, 1, RSTART - 1);
				match(title, /\".*\"/);
				title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
			}
			if(match(url, /^<.*>$/))
				url = substr(url, 2, RLENGTH - 2);
			url = eschtml(url);
			return t1 "<a href=\"" url "\"" title ">" nextil(linktext) "</a>" nextil(pt2);
		}
		else{
			# Referenced
			sub(/^ ?\[/, "", t2);
			id = linktext;
			if(match(t2, /^[^\]]+/))
				id = substr(t2, 1, RLENGTH);
			t2 = substr(t2, RLENGTH + 2);
			if(ref[id])
				r = ref[id];
			else{
				# Not defined yet: leave a placeholder for subref()
				r = "<<" id;
				nr++;
			}
			pt2 = t2;
			return t1 "<a href=\"" r "\">" nextil(linktext) "</a>" nextil(pt2);
		}
	}
	# Emphasis
	if(match(tag, /[*_]/)){
		ntag = tag;
		# Compare characters instead of building the regex "^*",
		# whose meaning is not portable across awk implementations.
		if(substr(t2, 1, 1) == tag){
			t2 = substr(t2, 2);
			if(stag[ns] == tag && substr(t2, 1, 1) == tag)
				t2 = tag t2;
			else
				ntag = tag tag;
		}
		n = length(ntag);
		tag = (n == 2) ? "strong" : "em";
		# A marker surrounded by spaces is literal text: emit the
		# marker itself, not the name of the html element.
		if(match(t1, / $/) && match(t2, /^ /))
			return t1 ntag nextil(t2);
		if(stag[ns] == ntag){
			tag = "/" tag;
			ns--;
		}
		else
			stag[++ns] = ntag;
		tag = "<" tag ">";
		return t1 tag nextil(t2);
	}
}

# Reset the inline state and convert the inline markup of t.
function inline(t) {
	ilcode = 0;
	ilcode2 = 0;
	ns = 0;
	return nextil(t);
}

# Flush the pending paragraph text, wrapped in <tag>..</tag> when tag is
# not empty.  Text made only of blanks is discarded.
function printp(tag) {
	if(!match(text, /^[ \t]*$/)){
		text = inline(text);
		if(tag != "")
			oprint("<" tag ">" text "</" tag ">");
		else
			oprint(text);
	}
	text = "";
}

BEGIN {
	blank = 0;
	code = 0;
	hr = 0;
	html = 0;
	nl = 0;
	nr = 0;
	otext = "";
	text = "";
	par = "p";
	# Block level html elements that are copied through untouched
	blocktags = "address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|" \
		"isindex|menu|noframes|noscript|ol|p|pre|table|ul";
}

# References
!code && /^ *\[[^\]]*\]:[ \t]+/ {
	sub(/^ *\[/, "");
	match($0, /\]/);
	id = substr($0, 1, RSTART - 1);
	# Drop `id]:` literally; the id may contain regex metacharacters
	$0 = substr($0, RSTART + 1);
	sub(/^:[ \t]+/, "");
	title = "";
	# The closing quote of the title is supplied by the tag template
	if(match($0, /\".*\"$/))
		title = "\" title=\"" substr($0, RSTART + 1, RLENGTH - 2);
	sub(/[ \t]+\".*\"$/, "");
	url = eschtml($0);
	ref[id] = url title;
	subref(id);
	next;
}

# html
!html && $0 ~ ("^<(" blocktags "|!--)") {
	if(code)
		oprint("</code></pre>");
	for(; !text && block[nl] == "blockquote"; nl--)
		oprint("</blockquote>");
	match($0, "^<(" blocktags "|!--)");
	htag = substr($0, 2, RLENGTH - 1);
	# Stay in html mode unless the element is closed on this same line
	if(!match($0, "(<\\/" htag ">)|((^<hr ?\\/?)|(--)>$)"))
		html = 1;
	if(html && match($0, /^<hr/))
		hr = 1;
	oprint($0);
	next;
}
html && ($0 ~ ("(^<\\/(" blocktags ").*)|(--)>$") || (hr && />$/)) {
	html = 0;
	hr = 0;
	oprint($0);
	next;
}
html {
	oprint($0);
	next;
}

# List and quote blocks

#   Remove indentation
{
	for(nnl = 0; nnl < nl; nnl++)
		if((match(block[nnl + 1], /[ou]l/) && !sub(/^(    |\t)/, "")) ||
		(block[nnl + 1] == "blockquote" && !sub(/^> ?/, "")))
			break;
}
# A lazy continuation line keeps the current nesting
nnl < nl && !blank && text && ! /^ ? ? ?([*+-]|([0-9]+\.)+)( +|\t)/ { nnl = nl; }

#   Quote blocks
{
	while(sub(/^> /, ""))
		nblock[++nnl] = "blockquote";
}

#   Horizontal rules
{ hr = 0; }
(blank || (!text && !code)) && /^ ? ? ?([-*_][ \t]*)([-*_][ \t]*)([-*_][ \t]*)+$/ {
	if(code){
		oprint("</code></pre>");
		code = 0;
	}
	blank = 0;
	nnl = 0;
	hr = 1;
}

#   List items
block[nl] ~ /[ou]l/ && /^$/ {
	blank = 1;
	next;
}
{ newli = 0; }
!hr && (nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?[*+-]( +|\t)/ {
	sub(/^ ? ? ?[*+-]( +|\t)/, "");
	nnl++;
	nblock[nnl] = "ul";
	newli = 1;
}
(nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?([0-9]+\.)+( +|\t)/ {
	sub(/^ ? ? ?([0-9]+\.)+( +|\t)/, "");
	nnl++;
	nblock[nnl] = "ol";
	newli = 1;
}
newli {
	if(blank && nnl == nl && !par)
		par = "p";
	blank = 0;
	printp(par);
	if(nnl == nl && block[nl] == nblock[nl])
		oprint("</li><li>");
}
blank && ! /^$/ {
	if(match(block[nnl], /[ou]l/) && !par)
		par = "p";
	printp(par);
	par = "p";
	blank = 0;
}

# Close old blocks and open new ones
nnl != nl || nblock[nl] != block[nl] {
	if(code){
		oprint("</code></pre>");
		code = 0;
	}
	printp(par);
	b = (nnl > nl) ? nblock[nnl] : block[nnl];
	par = (match(b, /[ou]l/)) ? "" : "p";
}
nnl < nl || (nnl == nl && nblock[nl] != block[nl]) {
	# NOTE(review): pblock is never assigned anywhere in this file, so
	# `pblock[nl] != block[nl]` is true for any open block and the block at
	# level nnl is always closed here and reopened by the rule below.  It
	# looks like a typo for nblock, but renaming it changes the generated
	# list markup -- confirm the intent before touching it.
	for(; nl > nnl || (nnl == nl && pblock[nl] != block[nl]); nl--){
		if(match(block[nl], /[ou]l/))
			oprint("</li>");
		oprint("</" block[nl] ">");
	}
}
nnl > nl {
	for(; nl < nnl; nl++){
		block[nl + 1] = nblock[nl + 1];
		oprint("<" block[nl + 1] ">");
		if(match(block[nl + 1], /[ou]l/))
			oprint("<li>");
	}
}
hr {
	oprint("<hr>");
	next;
}

# Code blocks
# NOTE(review): `blanK` (capital K) is never set, so the two tests below are
# always false and blank lines inside a code block are not reproduced.  This
# is presumably a typo for `blank`, but `blank` is already cleared by an
# earlier rule before the second test is reached, so a plain rename does not
# restore the blank lines correctly -- verify before changing.
code && /^$/ {
	if(blanK)
		oprint("");
	blank = 1;
	next;
}
!text && sub(/^(\t|    )/, "") {
	if(blanK)
		oprint("");
	blank = 0;
	if(!code)
		oprint("<pre><code>");
	code = 1;
	$0 = eschtml($0);
	oprint($0);
	next;
}
code {
	oprint("</code></pre>");
	code = 0;
}

# Setex-style Headers
text && /^=+$/ {printp("h1"); next;}
text && /^-+$/ {printp("h2"); next;}

# Atx-Style headers
/^#+/ && (!newli || par=="p" || /^##/) {
	for(n = 0; n < 6 && sub(/^# */, ""); n++)
		sub(/#$/, "");
	par = "h" n;
}

# Paragraph
/^$/ {
	printp(par);
	par = "p";
	next;
}

# Add text
{ text = (text ? text " " : "") $0; }

END {
	if(code){
		oprint("</code></pre>");
		code = 0;
	}
	printp(par);
	for(; nl > 0; nl--){
		if(match(block[nl], /[ou]l/))
			oprint("</li>");
		oprint("</" block[nl] ">");
	}
	# Drop placeholders of references that were never defined
	gsub(/<<[^\"]*/, "", otext);
	print(otext);
}