podgrab/internal/sanitize/sanitize.go

391 lines
10 KiB
Go

// Package sanitize provides functions for sanitizing text.
package sanitize
import (
"bytes"
"html"
"html/template"
"io"
"path"
"regexp"
"strings"
parser "golang.org/x/net/html"
)
var (
ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"}
defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote", "article", "section"}
defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"}
)
// HTMLAllowing sanitizes html, allowing some tags.
// Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments.
func HTMLAllowing(s string, args ...[]string) (string, error) {
allowedTags := defaultTags
if len(args) > 0 {
allowedTags = args[0]
}
allowedAttributes := defaultAttributes
if len(args) > 1 {
allowedAttributes = args[1]
}
// Parse the html
tokenizer := parser.NewTokenizer(strings.NewReader(s))
buffer := bytes.NewBufferString("")
ignore := ""
for {
tokenType := tokenizer.Next()
token := tokenizer.Token()
switch tokenType {
case parser.ErrorToken:
err := tokenizer.Err()
if err == io.EOF {
return buffer.String(), nil
}
return "", err
case parser.StartTagToken:
if len(ignore) == 0 && includes(allowedTags, token.Data) {
token.Attr = cleanAttributes(token.Attr, allowedAttributes)
buffer.WriteString(token.String())
} else if includes(ignoreTags, token.Data) {
ignore = token.Data
}
case parser.SelfClosingTagToken:
if len(ignore) == 0 && includes(allowedTags, token.Data) {
token.Attr = cleanAttributes(token.Attr, allowedAttributes)
buffer.WriteString(token.String())
} else if token.Data == ignore {
ignore = ""
}
case parser.EndTagToken:
if len(ignore) == 0 && includes(allowedTags, token.Data) {
token.Attr = []parser.Attribute{}
buffer.WriteString(token.String())
} else if token.Data == ignore {
ignore = ""
}
case parser.TextToken:
// We allow text content through, unless ignoring this entire tag and its contents (including other tags)
if ignore == "" {
buffer.WriteString(token.String())
}
case parser.CommentToken:
// We ignore comments by default
case parser.DoctypeToken:
// We ignore doctypes by default - html5 does not require them and this is intended for sanitizing snippets of text
default:
// We ignore unknown token types by default
}
}
}
// HTML strips html tags, replace common entities, and escapes <>&;'" in the result.
// Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated.
func HTML(s string) (output string) {
// Shortcut strings with no tags in them
if !strings.ContainsAny(s, "<>") {
output = s
} else {
// First remove line breaks etc as these have no meaning outside html tags (except pre)
// this means pre sections will lose formatting... but will result in less unintentional paras.
s = strings.Replace(s, "\n", "", -1)
// Then replace line breaks with newlines, to preserve that formatting
s = strings.Replace(s, "</p>", "\n", -1)
s = strings.Replace(s, "<br>", "\n", -1)
s = strings.Replace(s, "</br>", "\n", -1)
s = strings.Replace(s, "<br/>", "\n", -1)
s = strings.Replace(s, "<br />", "\n", -1)
// Walk through the string removing all tags
b := bytes.NewBufferString("")
inTag := false
for _, r := range s {
switch r {
case '<':
inTag = true
case '>':
inTag = false
default:
if !inTag {
b.WriteRune(r)
}
}
}
output = b.String()
}
// Remove a few common harmless entities, to arrive at something more like plain text
output = strings.Replace(output, "&#8216;", "'", -1)
output = strings.Replace(output, "&#8217;", "'", -1)
output = strings.Replace(output, "&#8220;", "\"", -1)
output = strings.Replace(output, "&#8221;", "\"", -1)
output = strings.Replace(output, "&nbsp;", " ", -1)
output = strings.Replace(output, "&quot;", "\"", -1)
output = strings.Replace(output, "&apos;", "'", -1)
// Translate some entities into their plain text equivalent (for example accents, if encoded as entities)
output = html.UnescapeString(output)
// In case we have missed any tags above, escape the text - removes <, >, &, ' and ".
output = template.HTMLEscapeString(output)
// After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString
output = strings.Replace(output, "&#34;", "\"", -1)
output = strings.Replace(output, "&#39;", "'", -1)
output = strings.Replace(output, "&amp; ", "& ", -1) // NB space after
output = strings.Replace(output, "&amp;amp; ", "& ", -1) // NB space after
return output
}
// We are very restrictive as this is intended for ascii url slugs
var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)
// Path makes a string safe to use as a URL path,
// removing accents and replacing separators with -.
// The path may still start at / and is not intended
// for use as a file system path without prefix.
func Path(s string) string {
// Start with lowercase string
filePath := strings.ToLower(s)
filePath = strings.Replace(filePath, "..", "", -1)
filePath = path.Clean(filePath)
// Remove illegal characters for paths, flattening accents
// and replacing some common separators with -
filePath = cleanString(filePath, illegalPath)
// NB this may be of length 0, caller must check
return filePath
}
// Remove all other unrecognised characters apart from
var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`)
// Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
func Name(s string) string {
// Start with lowercase string
fileName := s
fileName = baseNameSeparators.ReplaceAllString(fileName, "-")
fileName = path.Clean(path.Base(fileName))
// Remove illegal characters for names, replacing some common separators with -
fileName = cleanString(fileName, illegalName)
// NB this may be of length 0, caller must check
return fileName
}
// Replace these separators with -
var baseNameSeparators = regexp.MustCompile(`[./]`)
// BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -.
// No attempt is made to normalise a path or normalise case.
func BaseName(s string) string {
// Replace certain joining characters with a dash
baseName := baseNameSeparators.ReplaceAllString(s, "-")
// Remove illegal characters for names, replacing some common separators with -
baseName = cleanString(baseName, illegalName)
// NB this may be of length 0, caller must check
return baseName
}
// A very limited list of transliterations to catch common european names translated to urls.
// This set could be expanded with at least caps and many more characters.
var transliterations = map[rune]string{
'À': "A",
'Á': "A",
'Â': "A",
'Ã': "A",
'Ä': "A",
'Å': "AA",
'Æ': "AE",
'Ç': "C",
'È': "E",
'É': "E",
'Ê': "E",
'Ë': "E",
'Ì': "I",
'Í': "I",
'Î': "I",
'Ï': "I",
'Ð': "D",
'Ł': "L",
'Ñ': "N",
'Ò': "O",
'Ó': "O",
'Ô': "O",
'Õ': "O",
'Ö': "OE",
'Ø': "OE",
'Œ': "OE",
'Ù': "U",
'Ú': "U",
'Ü': "UE",
'Û': "U",
'Ý': "Y",
'Þ': "TH",
'ẞ': "SS",
'à': "a",
'á': "a",
'â': "a",
'ã': "a",
'ä': "ae",
'å': "aa",
'æ': "ae",
'ç': "c",
'è': "e",
'é': "e",
'ê': "e",
'ë': "e",
'ì': "i",
'í': "i",
'î': "i",
'ï': "i",
'ð': "d",
'ł': "l",
'ñ': "n",
'ń': "n",
'ò': "o",
'ó': "o",
'ô': "o",
'õ': "o",
'ō': "o",
'ö': "oe",
'ø': "oe",
'œ': "oe",
'ś': "s",
'ù': "u",
'ú': "u",
'û': "u",
'ū': "u",
'ü': "ue",
'ý': "y",
'ÿ': "y",
'ż': "z",
'þ': "th",
'ß': "ss",
}
// Accents replaces a set of accented characters with ascii equivalents.
func Accents(s string) string {
// Replace some common accent characters
b := bytes.NewBufferString("")
for _, c := range s {
// Check transliterations first
if val, ok := transliterations[c]; ok {
b.WriteString(val)
} else {
b.WriteRune(c)
}
}
return b.String()
}
var (
// If the attribute contains data: or javascript: anywhere, ignore it
// we don't allow this in attributes as it is so frequently used for xss
// NB we allow spaces in the value, and lowercase.
illegalAttr = regexp.MustCompile(`(d\s*a\s*t\s*a|j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*)\s*:`)
// We are far more restrictive with href attributes.
legalHrefAttr = regexp.MustCompile(`\A[/#][^/\\]?|mailto:|http://|https://`)
)
// cleanAttributes returns an array of attributes after removing malicious ones.
func cleanAttributes(a []parser.Attribute, allowed []string) []parser.Attribute {
if len(a) == 0 {
return a
}
var cleaned []parser.Attribute
for _, attr := range a {
if includes(allowed, attr.Key) {
val := strings.ToLower(attr.Val)
// Check for illegal attribute values
if illegalAttr.FindString(val) != "" {
attr.Val = ""
}
// Check for legal href values - / mailto:// http:// or https://
if attr.Key == "href" {
if legalHrefAttr.FindString(val) == "" {
attr.Val = ""
}
}
// If we still have an attribute, append it to the array
if attr.Val != "" {
cleaned = append(cleaned, attr)
}
}
}
return cleaned
}
// A list of characters we consider separators in normal strings and replace with our canonical separator - rather than removing.
var (
separators = regexp.MustCompile(`[!&_="#|+?:]`)
dashes = regexp.MustCompile(`[\-]+`)
)
// cleanString replaces separators with - and removes characters listed in the regexp provided from string.
// Accents, spaces, and all characters not in A-Za-z0-9 are replaced.
func cleanString(s string, r *regexp.Regexp) string {
// Remove any trailing space to avoid ending on -
s = strings.Trim(s, " ")
// Flatten accents first so that if we remove non-ascii we still get a legible name
s = Accents(s)
// Replace certain joining characters with a dash
s = separators.ReplaceAllString(s, "-")
// Remove all other unrecognised characters - NB we do allow any printable characters
//s = r.ReplaceAllString(s, "")
// Remove any multiple dashes caused by replacements above
s = dashes.ReplaceAllString(s, "-")
return s
}
// includes checks for inclusion of a string in a []string.
func includes(a []string, s string) bool {
for _, as := range a {
if as == s {
return true
}
}
return false
}