From ee0e72f246446e9e07803b8e9db1503b23160ce2 Mon Sep 17 00:00:00 2001 From: Antoni Sawicki Date: Sat, 22 Jun 2024 11:53:20 -0700 Subject: [PATCH] use plugins for markdown --- go.mod | 1 + go.sum | 3 +++ wrp.go | 16 ++++++++++++---- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 228c73b..bab96da 100644 --- a/go.mod +++ b/go.mod @@ -22,4 +22,5 @@ require ( github.com/mailru/easyjson v0.7.7 // indirect golang.org/x/net v0.25.0 // indirect golang.org/x/sys v0.20.0 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect ) diff --git a/go.sum b/go.sum index 77d3705..22adfd0 100644 --- a/go.sum +++ b/go.sum @@ -26,8 +26,10 @@ github.com/gomarkdown/markdown v0.0.0-20240419095408-642f0ee99ae2 h1:yEt5djSYb4i github.com/gomarkdown/markdown v0.0.0-20240419095408-642f0ee99ae2/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= @@ -106,6 +108,7 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= diff --git a/wrp.go b/wrp.go index 7ef1120..69d2914 100644 --- a/wrp.go +++ b/wrp.go @@ -35,6 +35,8 @@ import ( "text/template" "time" + "github.com/JohannesKaufmann/html-to-markdown/plugin" + h2m "github.com/JohannesKaufmann/html-to-markdown" "github.com/MaxHalford/halfgone" "github.com/chromedp/cdproto/css" @@ -429,14 +431,19 @@ func asciify(s []byte) []byte { } func (rq *wrpReq) toMarkdown() { - log.Printf("Processing Markdown conversion for %v", rq.url) - h := h2m.NewConverter(h2m.DomainFromURL(rq.url), true, nil) - md, err := h.ConvertURL(rq.url) + log.Printf("Processing Markdown conversion request for %v", rq.url) + // TODO: bug - DomainFromURL always prefixes with http:// instead of https + // this causes issues on some websites, write a smarter DomainFromURL + c := h2m.NewConverter(h2m.DomainFromURL(rq.url), true, nil) + c.Use(plugin.GitHubFlavored()) + // We could alternatively get inner html from chromedp + md, err := c.ConvertURL(rq.url) if err != nil { http.Error(rq.w, err.Error(), http.StatusInternalServerError) return } - p := parser.New() + log.Printf("Got %v bytes md from %v", len(md), rq.url) + p := parser.NewWithExtensions(parser.CommonExtensions) d := p.Parse([]byte(md)) ast.WalkFunc(d, func(node ast.Node, entering bool) ast.WalkStatus { if link, ok := node.(*ast.Link); ok && entering { @@ -449,6 +456,7 @@ func (rq *wrpReq) toMarkdown() { }) r := html.NewRenderer(html.RendererOptions{}) ht := markdown.Render(d, r) + log.Printf("Rendered %v bytes of html for %v", len(ht), rq.url) // TODO: add https://github.com/microcosm-cc/bluemonday rq.printHTML(printParams{ text: string(asciify(ht)),