diff options
author | zeripath | 2019-04-28 00:36:12 +0100 |
---|---|---|
committer | techknowlogick | 2019-04-27 19:36:12 -0400 |
commit | 21fb7917473773a5b9c9d52fbcdbfbcc9fa94a22 (patch) | |
tree | 2cf833a52a4bce3e2ac83afc8ecfafb9678a21d6 | |
parent | 8b3aad940e915b9db11deb0f06d9e5338cfe3fdd (diff) |
Detect encoding and BOM in content (#6727) (#6765)
Detect and remove a decoded BOM when showing content.
Restore the previous encoding and BOM when updating content.
On error keep as UTF-8 encoding.
Signed-off-by: Andrew Thornton <art27@cantab.net>
-rw-r--r-- | modules/base/tool.go | 12 | ||||
-rw-r--r-- | modules/templates/helper.go | 14 | ||||
-rw-r--r-- | modules/uploader/update.go | 108 |
3 files changed, 127 insertions, 7 deletions
diff --git a/modules/base/tool.go b/modules/base/tool.go index b069e5fae..4dd7ba59b 100644 --- a/modules/base/tool.go +++ b/modules/base/tool.go @@ -5,6 +5,7 @@ package base import ( + "bytes" "crypto/md5" "crypto/rand" "crypto/sha1" @@ -32,6 +33,9 @@ import ( "github.com/gogits/chardet" ) +// UTF8BOM is the utf-8 byte-order marker +var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'} + // EncodeMD5 encodes string to md5 hex value. func EncodeMD5(str string) string { m := md5.New() @@ -87,6 +91,14 @@ func DetectEncoding(content []byte) (string, error) { return result.Charset, err } +// RemoveBOMIfPresent removes a UTF-8 BOM from a []byte +func RemoveBOMIfPresent(content []byte) []byte { + if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) { + return content[3:] + } + return content +} + // BasicAuthDecode decode basic auth string func BasicAuthDecode(encoded string) (string, string, error) { s, err := base64.StdEncoding.DecodeString(encoded) diff --git a/modules/templates/helper.go b/modules/templates/helper.go index ce077d1a9..84952d14e 100644 --- a/modules/templates/helper.go +++ b/modules/templates/helper.go @@ -267,7 +267,7 @@ func ToUTF8WithErr(content []byte) (string, error) { if err != nil { return "", err } else if charsetLabel == "UTF-8" { - return string(content), nil + return string(base.RemoveBOMIfPresent(content)), nil } encoding, _ := charset.Lookup(charsetLabel) @@ -277,19 +277,21 @@ func ToUTF8WithErr(content []byte) (string, error) { // If there is an error, we concatenate the nicely decoded part and the // original left over. This way we won't lose data. - result, n, err := transform.String(encoding.NewDecoder(), string(content)) + result, n, err := transform.Bytes(encoding.NewDecoder(), content) if err != nil { - result = result + string(content[n:]) + result = append(result, content[n:]...) } - return result, err + result = base.RemoveBOMIfPresent(result) + + return string(result), err } // ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible func ToUTF8WithFallback(content []byte) []byte { charsetLabel, err := base.DetectEncoding(content) if err != nil || charsetLabel == "UTF-8" { - return content + return base.RemoveBOMIfPresent(content) } encoding, _ := charset.Lookup(charsetLabel) @@ -304,7 +306,7 @@ func ToUTF8WithFallback(content []byte) []byte { return append(result, content[n:]...) } - return result + return base.RemoveBOMIfPresent(result) } // ToUTF8 converts content to UTF8 encoding and ignore error diff --git a/modules/uploader/update.go b/modules/uploader/update.go index 08caf11ee..fe85cbf47 100644 --- a/modules/uploader/update.go +++ b/modules/uploader/update.go @@ -5,15 +5,85 @@ package uploader import ( + "bytes" "fmt" "strings" + "golang.org/x/net/html/charset" + "golang.org/x/text/transform" + "code.gitea.io/git" "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/base" "code.gitea.io/gitea/modules/lfs" + "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" ) +func detectEncodingAndBOM(entry *git.TreeEntry, repo *models.Repository) (string, bool) { + reader, err := entry.Blob().DataAsync() + if err != nil { + // return default + return "UTF-8", false + } + defer reader.Close() + buf := make([]byte, 1024) + n, err := reader.Read(buf) + if err != nil { + // return default + return "UTF-8", false + } + buf = buf[:n] + + if setting.LFS.StartServer { + meta := lfs.IsPointerFile(&buf) + if meta != nil { + meta, err = repo.GetLFSMetaObjectByOid(meta.Oid) + if err != nil && err != models.ErrLFSObjectNotExist { + // return default + return "UTF-8", false + } + } + if meta != nil { + dataRc, err := lfs.ReadMetaObject(meta) + if err != nil { + // return default + return "UTF-8", false + } + defer dataRc.Close() + buf = make([]byte, 1024) + n, err = dataRc.Read(buf) + if err != nil { + // return default + return "UTF-8", false + } + buf = buf[:n] + } + + } + + encoding, err := base.DetectEncoding(buf) + if err != nil { + // just default to utf-8 and no bom + return "UTF-8", false + } + if encoding == "UTF-8" { + return encoding, bytes.Equal(buf[0:3], base.UTF8BOM) + } + charsetEncoding, _ := charset.Lookup(encoding) + if charsetEncoding == nil { + return "UTF-8", false + } + + result, n, err := transform.String(charsetEncoding.NewDecoder(), string(buf)) + + if n > 2 { + return encoding, bytes.Equal([]byte(result)[0:3], base.UTF8BOM) + } + + return encoding, false +} + // UpdateRepoFileOptions holds the repository file update options type UpdateRepoFileOptions struct { LastCommitID string @@ -45,12 +115,29 @@ func UpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepo return fmt.Errorf("UpdateRepoFile: %v", err) } + encoding := "UTF-8" + bom := false + if opts.IsNewFile { for _, file := range filesInIndex { if file == opts.NewTreeName { return models.ErrRepoFileAlreadyExist{FileName: opts.NewTreeName} } } + } else { + gitRepo, err := git.OpenRepository(t.basePath) + if err != nil { + return err + } + tree, err := gitRepo.GetTree("HEAD") + if err != nil { + return err + } + entry, err := tree.GetTreeEntryByPath(opts.OldTreeName) + if err != nil { + return err + } + encoding, bom = detectEncodingAndBOM(entry, repo) } //var stdout string @@ -72,9 +159,28 @@ func UpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepo } content := opts.Content + if bom { + content = string(base.UTF8BOM) + content + } + if encoding != "UTF-8" { + charsetEncoding, _ := charset.Lookup(encoding) + if charsetEncoding != nil { + result, _, err := transform.String(charsetEncoding.NewEncoder(), string(content)) + if err != nil { + // Look if we can't encode back in to the original we should just stick with utf-8 + log.Error(4, "Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", opts.NewTreeName, opts.OldTreeName, encoding, err) + result = content + } + content = result + } else { + log.Error(4, "Unknown encoding: %s", encoding) + } + } + // Reset the opts.Content with the re-encoded and BOM'd content + opts.Content = content var lfsMetaObject *models.LFSMetaObject - if filename2attribute2info[opts.NewTreeName] != nil && filename2attribute2info[opts.NewTreeName]["filter"] == "lfs" { + if setting.LFS.StartServer && filename2attribute2info[opts.NewTreeName] != nil && filename2attribute2info[opts.NewTreeName]["filter"] == "lfs" { // OK so we are supposed to LFS this data! oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content)) if err != nil { |