mirror of https://github.com/mautrix/go.git
503 lines
14 KiB
Go
503 lines
14 KiB
Go
// Copyright (c) 2020 Tulir Asokan
|
|
//
|
|
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
package format
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"math"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
|
|
"maunium.net/go/mautrix/event"
|
|
"maunium.net/go/mautrix/id"
|
|
)
|
|
|
|
// TagStack is the list of HTML tag names enclosing the node that is currently
// being converted. Tags are appended as the parser descends, so the last
// element is the innermost tag.
type TagStack []string

// Index returns the position of the innermost (last) occurrence of tag in the
// stack, or -1 if the tag is not on the stack.
func (ts TagStack) Index(tag string) int {
	for i := len(ts) - 1; i >= 0; i-- {
		if ts[i] == tag {
			return i
		}
	}
	return -1
}

// Has reports whether tag occurs anywhere in the stack.
func (ts TagStack) Has(tag string) bool {
	return ts.Index(tag) >= 0
}

// Context is the state handed to every converter while a document is parsed.
//
// Context is passed by value. The With* methods return a modified copy and
// leave the receiver untouched. ReturnData is a map, so all copies derived
// from the same Context share it.
type Context struct {
	// Ctx is the context.Context that was passed to NewContext.
	Ctx context.Context
	// ReturnData lets converters hand extra results back to the caller of
	// Parse, e.g. the mention list stored under ContextKeyMentions.
	ReturnData map[string]any
	// TagStack holds the tags enclosing the current node, outermost first.
	TagStack TagStack

	// PreserveWhitespace disables the removal of newlines from text nodes.
	PreserveWhitespace bool
}

// NewContext creates a Context with an empty ReturnData map and an empty
// TagStack.
func NewContext(ctx context.Context) Context {
	return Context{
		Ctx:        ctx,
		ReturnData: map[string]any{},
		TagStack:   make(TagStack, 0, 4),
	}
}

// WithTag returns a copy of the context with tag pushed onto the tag stack.
//
// The returned stack never shares writable storage with the receiver's stack:
// the slice is capped at its current length before appending, which forces
// append to allocate. Without that, two contexts derived from the same parent
// would write their tag into the same spare slot of the backing array and the
// later call would silently overwrite the tag seen by the earlier one.
func (ctx Context) WithTag(tag string) Context {
	stack := ctx.TagStack
	ctx.TagStack = append(stack[:len(stack):len(stack)], tag)
	return ctx
}

// WithWhitespace returns a copy of the context with PreserveWhitespace set.
func (ctx Context) WithWhitespace() Context {
	ctx.PreserveWhitespace = true
	return ctx
}
|
|
|
|
type TextConverter func(string, Context) string
|
|
type SpoilerConverter func(text, reason string, ctx Context) string
|
|
type LinkConverter func(text, href string, ctx Context) string
|
|
type ColorConverter func(text, fg, bg string, ctx Context) string
|
|
type CodeBlockConverter func(code, language string, ctx Context) string
|
|
type PillConverter func(displayname, mxid, eventID string, ctx Context) string
|
|
type ImageConverter func(src, alt, title, width, height string, isEmoji bool) string
|
|
|
|
// ContextKeyMentions is the Context.ReturnData key under which
// DefaultPillConverter collects the mentioned users as a []id.UserID.
const ContextKeyMentions = "_mentions"
|
|
|
|
func DefaultPillConverter(displayname, mxid, eventID string, ctx Context) string {
|
|
switch {
|
|
case len(mxid) == 0, mxid[0] == '@':
|
|
existingMentions, _ := ctx.ReturnData[ContextKeyMentions].([]id.UserID)
|
|
ctx.ReturnData[ContextKeyMentions] = append(existingMentions, id.UserID(mxid))
|
|
// User link, always just show the displayname
|
|
return displayname
|
|
case len(eventID) > 0:
|
|
// Event ID link, always just show the link
|
|
return fmt.Sprintf("https://matrix.to/#/%s/%s", mxid, eventID)
|
|
case mxid[0] == '!' && displayname == mxid:
|
|
// Room ID link with no separate display text, just show the link
|
|
return fmt.Sprintf("https://matrix.to/#/%s", mxid)
|
|
case mxid[0] == '#':
|
|
// Room alias link, just show the alias
|
|
return mxid
|
|
default:
|
|
// Other link (e.g. room ID link with display text), show text and link
|
|
return fmt.Sprintf("%s (https://matrix.to/#/%s)", displayname, mxid)
|
|
}
|
|
}
|
|
|
|
// HTMLParser is a somewhat customizable Matrix HTML parser.
//
// All converter fields are optional. When a converter is nil, the parser
// falls back to the built-in formatting described on each field.
type HTMLParser struct {
	// PillConverter is called for links whose href parses as a Matrix URI or
	// matrix.to URL. When nil, such links are handled like any other link.
	PillConverter PillConverter
	// TabsToSpaces is the number of spaces each tab in the input is replaced
	// with before parsing. Negative values leave tabs untouched.
	TabsToSpaces int
	// Newline is the text emitted for <br>.
	Newline string
	// HorizontalLine is the text emitted for <hr>.
	HorizontalLine string
	// BoldConverter formats <b> and <strong>. Default: **text**.
	BoldConverter TextConverter
	// ItalicConverter formats <i> and <em>. Default: _text_.
	ItalicConverter TextConverter
	// StrikethroughConverter formats <s>, <del> and <strike>. Default: ~~text~~.
	StrikethroughConverter TextConverter
	// UnderlineConverter formats <u> and <ins>. Default: the text unchanged.
	UnderlineConverter TextConverter
	// MathConverter receives the value of a data-mx-maths attribute.
	// Default: the attribute is ignored and the tag's children are used.
	MathConverter TextConverter
	// MathBlockConverter is used instead of MathConverter when the
	// data-mx-maths attribute is on a <div>. It is only consulted when
	// MathConverter is set as well.
	MathBlockConverter TextConverter
	// LinkConverter formats <a href> links that were not handled by
	// PillConverter. Default: "text (href)", or just the text when both are equal.
	LinkConverter LinkConverter
	// SpoilerConverter formats <span data-mx-spoiler>.
	// Default: ||text|| or ||reason|text||.
	SpoilerConverter SpoilerConverter
	// ColorConverter is called when a foreground or background color
	// attribute is present. Default: colors are ignored.
	ColorConverter ColorConverter
	// MonospaceBlockConverter formats <pre> blocks. Default: a ``` fenced block.
	MonospaceBlockConverter CodeBlockConverter
	// MonospaceConverter formats <tt> and <code>. Default: the text wrapped
	// in enough backticks to not collide with backticks inside it.
	MonospaceConverter TextConverter
	// TextConverter, if set, is applied to the content of every text node.
	TextConverter TextConverter
	// ImageConverter formats <img>. Default: the alt attribute.
	ImageConverter ImageConverter
}
|
|
|
|
// TaggedString is the converted text of one node together with a label
// describing the node it came from.
type TaggedString struct {
	// string is the converted text.
	string
	// tag is the element name for element nodes, "text" for text nodes,
	// "html" for the document node and "unknown" for everything else.
	tag string
}
|
|
|
|
func (parser *HTMLParser) maybeGetAttribute(node *html.Node, attribute string) (string, bool) {
|
|
for _, attr := range node.Attr {
|
|
if attr.Key == attribute {
|
|
return attr.Val, true
|
|
}
|
|
}
|
|
return "", false
|
|
}
|
|
|
|
func (parser *HTMLParser) getAttribute(node *html.Node, attribute string) string {
|
|
val, _ := parser.maybeGetAttribute(node, attribute)
|
|
return val
|
|
}
|
|
|
|
// Digits counts the number of characters needed to print an integer in
// base 10: its digits, plus one for the sign if the number is negative.
//
// The count is done with integer arithmetic only. A floating point logarithm
// gives wrong answers for large values that float64 cannot represent exactly
// (e.g. 999999999999999999 rounds up to 1e18).
func Digits(num int) int {
	if num == 0 {
		return 1
	}
	count := 0
	if num < 0 {
		// Account for the minus sign.
		count = 1
		if num == math.MinInt {
			// -math.MinInt overflows back to math.MinInt, so strip one digit
			// while the number is still negative to make negation safe.
			num /= 10
			count++
		}
		num = -num
	}
	for ; num > 0; num /= 10 {
		count++
	}
	return count
}
|
|
|
|
func (parser *HTMLParser) listToString(node *html.Node, ctx Context) string {
|
|
ordered := node.Data == "ol"
|
|
taggedChildren := parser.nodeToTaggedStrings(node.FirstChild, ctx)
|
|
counter := 1
|
|
indentLength := 0
|
|
if ordered {
|
|
start := parser.getAttribute(node, "start")
|
|
if len(start) > 0 {
|
|
counter, _ = strconv.Atoi(start)
|
|
}
|
|
|
|
longestIndex := (counter - 1) + len(taggedChildren)
|
|
indentLength = Digits(longestIndex)
|
|
}
|
|
indent := strings.Repeat(" ", indentLength+2)
|
|
var children []string
|
|
for _, child := range taggedChildren {
|
|
if child.tag != "li" {
|
|
continue
|
|
}
|
|
var prefix string
|
|
// TODO make bullets and numbering configurable
|
|
if ordered {
|
|
indexPadding := indentLength - Digits(counter)
|
|
if indexPadding < 0 {
|
|
// This will happen on negative start indexes where longestIndex is usually wrong, otherwise shouldn't happen
|
|
indexPadding = 0
|
|
}
|
|
prefix = fmt.Sprintf("%d. %s", counter, strings.Repeat(" ", indexPadding))
|
|
} else {
|
|
prefix = "* "
|
|
}
|
|
str := prefix + child.string
|
|
counter++
|
|
parts := strings.Split(str, "\n")
|
|
for i, part := range parts[1:] {
|
|
parts[i+1] = indent + part
|
|
}
|
|
str = strings.Join(parts, "\n")
|
|
children = append(children, str)
|
|
}
|
|
return strings.Join(children, "\n")
|
|
}
|
|
|
|
// LongestSequence returns the length, in runes, of the longest run of
// consecutive occurrences of the rune `of` in the string `in`. It returns 0
// when the rune does not occur at all.
func LongestSequence(in string, of rune) int {
	longest, run := 0, 0
	for _, r := range in {
		if r != of {
			run = 0
			continue
		}
		run++
		if run > longest {
			longest = run
		}
	}
	return longest
}
|
|
|
|
func (parser *HTMLParser) basicFormatToString(node *html.Node, ctx Context) string {
|
|
str := parser.nodeToTagAwareString(node.FirstChild, ctx)
|
|
switch node.Data {
|
|
case "b", "strong":
|
|
if parser.BoldConverter != nil {
|
|
return parser.BoldConverter(str, ctx)
|
|
}
|
|
return fmt.Sprintf("**%s**", str)
|
|
case "i", "em":
|
|
if parser.ItalicConverter != nil {
|
|
return parser.ItalicConverter(str, ctx)
|
|
}
|
|
return fmt.Sprintf("_%s_", str)
|
|
case "s", "del", "strike":
|
|
if parser.StrikethroughConverter != nil {
|
|
return parser.StrikethroughConverter(str, ctx)
|
|
}
|
|
return fmt.Sprintf("~~%s~~", str)
|
|
case "u", "ins":
|
|
if parser.UnderlineConverter != nil {
|
|
return parser.UnderlineConverter(str, ctx)
|
|
}
|
|
case "tt", "code":
|
|
if parser.MonospaceConverter != nil {
|
|
return parser.MonospaceConverter(str, ctx)
|
|
}
|
|
surround := strings.Repeat("`", LongestSequence(str, '`')+1)
|
|
return fmt.Sprintf("%s%s%s", surround, str, surround)
|
|
}
|
|
return str
|
|
}
|
|
|
|
func (parser *HTMLParser) spanToString(node *html.Node, ctx Context) string {
|
|
str := parser.nodeToTagAwareString(node.FirstChild, ctx)
|
|
if node.Data == "span" || node.Data == "div" {
|
|
math, _ := parser.maybeGetAttribute(node, "data-mx-maths")
|
|
if math != "" && parser.MathConverter != nil {
|
|
if node.Data == "div" && parser.MathBlockConverter != nil {
|
|
str = parser.MathBlockConverter(math, ctx)
|
|
} else {
|
|
str = parser.MathConverter(math, ctx)
|
|
}
|
|
}
|
|
}
|
|
if node.Data == "span" {
|
|
reason, isSpoiler := parser.maybeGetAttribute(node, "data-mx-spoiler")
|
|
if isSpoiler {
|
|
if parser.SpoilerConverter != nil {
|
|
str = parser.SpoilerConverter(str, reason, ctx)
|
|
} else if len(reason) > 0 {
|
|
str = fmt.Sprintf("||%s|%s||", reason, str)
|
|
} else {
|
|
str = fmt.Sprintf("||%s||", str)
|
|
}
|
|
}
|
|
}
|
|
if parser.ColorConverter != nil {
|
|
fg := parser.getAttribute(node, "data-mx-color")
|
|
if len(fg) == 0 && node.Data == "font" {
|
|
fg = parser.getAttribute(node, "color")
|
|
}
|
|
bg := parser.getAttribute(node, "data-mx-bg-color")
|
|
if len(bg) > 0 || len(fg) > 0 {
|
|
str = parser.ColorConverter(str, fg, bg, ctx)
|
|
}
|
|
}
|
|
return str
|
|
}
|
|
|
|
func (parser *HTMLParser) headerToString(node *html.Node, ctx Context) string {
|
|
children := parser.nodeToStrings(node.FirstChild, ctx)
|
|
length := int(node.Data[1] - '0')
|
|
prefix := strings.Repeat("#", length) + " "
|
|
return prefix + strings.Join(children, "")
|
|
}
|
|
|
|
func (parser *HTMLParser) blockquoteToString(node *html.Node, ctx Context) string {
|
|
str := parser.nodeToTagAwareString(node.FirstChild, ctx)
|
|
childrenArr := strings.Split(strings.TrimSpace(str), "\n")
|
|
// TODO make blockquote prefix configurable
|
|
for index, child := range childrenArr {
|
|
childrenArr[index] = "> " + child
|
|
}
|
|
return strings.Join(childrenArr, "\n")
|
|
}
|
|
|
|
func (parser *HTMLParser) linkToString(node *html.Node, ctx Context) string {
|
|
str := parser.nodeToTagAwareString(node.FirstChild, ctx)
|
|
href := parser.getAttribute(node, "href")
|
|
if len(href) == 0 {
|
|
return str
|
|
}
|
|
if parser.PillConverter != nil {
|
|
parsedMatrix, err := id.ParseMatrixURIOrMatrixToURL(href)
|
|
if err == nil && parsedMatrix != nil {
|
|
return parser.PillConverter(str, parsedMatrix.PrimaryIdentifier(), parsedMatrix.SecondaryIdentifier(), ctx)
|
|
}
|
|
}
|
|
if parser.LinkConverter != nil {
|
|
return parser.LinkConverter(str, href, ctx)
|
|
} else if str == href {
|
|
return str
|
|
}
|
|
return fmt.Sprintf("%s (%s)", str, href)
|
|
}
|
|
|
|
func (parser *HTMLParser) imgToString(node *html.Node, ctx Context) string {
|
|
src := parser.getAttribute(node, "src")
|
|
alt := parser.getAttribute(node, "alt")
|
|
title := parser.getAttribute(node, "title")
|
|
width := parser.getAttribute(node, "width")
|
|
height := parser.getAttribute(node, "height")
|
|
_, isEmoji := parser.maybeGetAttribute(node, "data-mx-emoticon")
|
|
if parser.ImageConverter != nil {
|
|
return parser.ImageConverter(src, alt, title, width, height, isEmoji)
|
|
}
|
|
return alt
|
|
}
|
|
|
|
func (parser *HTMLParser) tagToString(node *html.Node, ctx Context) string {
|
|
ctx = ctx.WithTag(node.Data)
|
|
switch node.Data {
|
|
case "blockquote":
|
|
return parser.blockquoteToString(node, ctx)
|
|
case "ol", "ul":
|
|
return parser.listToString(node, ctx)
|
|
case "h1", "h2", "h3", "h4", "h5", "h6":
|
|
return parser.headerToString(node, ctx)
|
|
case "br":
|
|
return parser.Newline
|
|
case "b", "strong", "i", "em", "s", "strike", "del", "u", "ins", "tt", "code":
|
|
return parser.basicFormatToString(node, ctx)
|
|
case "span", "font":
|
|
return parser.spanToString(node, ctx)
|
|
case "a":
|
|
return parser.linkToString(node, ctx)
|
|
case "p":
|
|
return parser.nodeToTagAwareString(node.FirstChild, ctx)
|
|
case "img":
|
|
return parser.imgToString(node, ctx)
|
|
case "hr":
|
|
return parser.HorizontalLine
|
|
case "pre":
|
|
var preStr, language string
|
|
if node.FirstChild != nil && node.FirstChild.Type == html.ElementNode && node.FirstChild.Data == "code" {
|
|
class := parser.getAttribute(node.FirstChild, "class")
|
|
if strings.HasPrefix(class, "language-") {
|
|
language = class[len("language-"):]
|
|
}
|
|
preStr = parser.nodeToString(node.FirstChild.FirstChild, ctx.WithWhitespace())
|
|
} else {
|
|
preStr = parser.nodeToString(node.FirstChild, ctx.WithWhitespace())
|
|
}
|
|
if parser.MonospaceBlockConverter != nil {
|
|
return parser.MonospaceBlockConverter(preStr, language, ctx)
|
|
}
|
|
if len(preStr) == 0 || preStr[len(preStr)-1] != '\n' {
|
|
preStr += "\n"
|
|
}
|
|
return fmt.Sprintf("```%s\n%s```", language, preStr)
|
|
default:
|
|
return parser.nodeToTagAwareString(node.FirstChild, ctx)
|
|
}
|
|
}
|
|
|
|
func (parser *HTMLParser) singleNodeToString(node *html.Node, ctx Context) TaggedString {
|
|
switch node.Type {
|
|
case html.TextNode:
|
|
if !ctx.PreserveWhitespace {
|
|
node.Data = strings.Replace(node.Data, "\n", "", -1)
|
|
}
|
|
if parser.TextConverter != nil {
|
|
node.Data = parser.TextConverter(node.Data, ctx)
|
|
}
|
|
return TaggedString{node.Data, "text"}
|
|
case html.ElementNode:
|
|
return TaggedString{parser.tagToString(node, ctx), node.Data}
|
|
case html.DocumentNode:
|
|
return TaggedString{parser.nodeToTagAwareString(node.FirstChild, ctx), "html"}
|
|
default:
|
|
return TaggedString{"", "unknown"}
|
|
}
|
|
}
|
|
|
|
func (parser *HTMLParser) nodeToTaggedStrings(node *html.Node, ctx Context) (strs []TaggedString) {
|
|
for ; node != nil; node = node.NextSibling {
|
|
strs = append(strs, parser.singleNodeToString(node, ctx))
|
|
}
|
|
return
|
|
}
|
|
|
|
// BlockTags lists the tags whose converted output is surrounded with newlines
// when it is joined with the output of its sibling nodes.
var BlockTags = []string{"p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "blockquote", "div", "hr", "table"}
|
|
|
|
func (parser *HTMLParser) isBlockTag(tag string) bool {
|
|
for _, blockTag := range BlockTags {
|
|
if tag == blockTag {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (parser *HTMLParser) nodeToTagAwareString(node *html.Node, ctx Context) string {
|
|
strs := parser.nodeToTaggedStrings(node, ctx)
|
|
var output strings.Builder
|
|
for _, str := range strs {
|
|
tstr := str.string
|
|
if parser.isBlockTag(str.tag) {
|
|
tstr = fmt.Sprintf("\n%s\n", tstr)
|
|
}
|
|
output.WriteString(tstr)
|
|
}
|
|
return strings.TrimSpace(output.String())
|
|
}
|
|
|
|
func (parser *HTMLParser) nodeToStrings(node *html.Node, ctx Context) (strs []string) {
|
|
for ; node != nil; node = node.NextSibling {
|
|
strs = append(strs, parser.singleNodeToString(node, ctx).string)
|
|
}
|
|
return
|
|
}
|
|
|
|
func (parser *HTMLParser) nodeToString(node *html.Node, ctx Context) string {
|
|
return strings.Join(parser.nodeToStrings(node, ctx), "")
|
|
}
|
|
|
|
// Parse converts Matrix HTML into text using the settings in this parser.
|
|
func (parser *HTMLParser) Parse(htmlData string, ctx Context) string {
|
|
if parser.TabsToSpaces >= 0 {
|
|
htmlData = strings.Replace(htmlData, "\t", strings.Repeat(" ", parser.TabsToSpaces), -1)
|
|
}
|
|
node, _ := html.Parse(strings.NewReader(htmlData))
|
|
return parser.nodeToTagAwareString(node, ctx)
|
|
}
|
|
|
|
// TextHTMLParser is a parser that converts HTML to plain text using only the
// built-in fallback formatting, with DefaultPillConverter handling links to
// Matrix identifiers.
var TextHTMLParser = &HTMLParser{
	TabsToSpaces:   4,
	Newline:        "\n",
	HorizontalLine: "\n---\n",
	PillConverter:  DefaultPillConverter,
}
|
|
|
|
// MarkdownHTMLParser is the parser HTMLToMarkdownFull uses when none is
// given. On top of the settings in TextHTMLParser it renders links as
// [text](href), inline math as $...$, block math as $$...$$ on separate lines
// and underlined text as <u>...</u>.
var MarkdownHTMLParser = &HTMLParser{
	TabsToSpaces:   4,
	Newline:        "\n",
	HorizontalLine: "\n---\n",
	PillConverter:  DefaultPillConverter,
	// Links whose text equals the target are left bare instead of being
	// written as [url](url).
	LinkConverter: func(text, href string, ctx Context) string {
		if text == href {
			return text
		}
		return fmt.Sprintf("[%s](%s)", text, href)
	},
	MathConverter: func(s string, c Context) string {
		return fmt.Sprintf("$%s$", s)
	},
	MathBlockConverter: func(s string, c Context) string {
		return fmt.Sprintf("$$\n%s\n$$", s)
	},
	// There is no markdown syntax used for underline here, so the HTML tag
	// is kept.
	UnderlineConverter: func(s string, c Context) string {
		return fmt.Sprintf("<u>%s</u>", s)
	},
}
|
|
|
|
// HTMLToText converts Matrix HTML into text with the default settings.
|
|
func HTMLToText(html string) string {
|
|
return (&HTMLParser{
|
|
TabsToSpaces: 4,
|
|
Newline: "\n",
|
|
HorizontalLine: "\n---\n",
|
|
PillConverter: DefaultPillConverter,
|
|
}).Parse(html, NewContext(context.TODO()))
|
|
}
|
|
|
|
func HTMLToMarkdownFull(parser *HTMLParser, html string) (parsed string, mentions *event.Mentions) {
|
|
if parser == nil {
|
|
parser = MarkdownHTMLParser
|
|
}
|
|
ctx := NewContext(context.TODO())
|
|
parsed = parser.Parse(html, ctx)
|
|
mentionList, _ := ctx.ReturnData[ContextKeyMentions].([]id.UserID)
|
|
mentions = &event.Mentions{
|
|
UserIDs: mentionList,
|
|
}
|
|
return
|
|
}
|
|
|
|
// HTMLToMarkdown converts Matrix HTML into markdown with the default settings.
|
|
//
|
|
// Currently, the only difference to HTMLToText is how links are formatted.
|
|
func HTMLToMarkdown(html string) string {
|
|
parsed, _ := HTMLToMarkdownFull(nil, html)
|
|
return parsed
|
|
}
|