aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author zeripath 2019-04-28 00:36:12 +0100
committer techknowlogick 2019-04-27 19:36:12 -0400
commit 21fb7917473773a5b9c9d52fbcdbfbcc9fa94a22 (patch)
tree 2cf833a52a4bce3e2ac83afc8ecfafb9678a21d6
parent 8b3aad940e915b9db11deb0f06d9e5338cfe3fdd (diff)
Detect encoding and BOM in content (#6727) (#6765)
Detect and remove a decoded BOM when showing content. Restore the previous encoding and BOM when updating content. On error keep as UTF-8 encoding. Signed-off-by: Andrew Thornton <art27@cantab.net>
-rw-r--r-- modules/base/tool.go 12
-rw-r--r-- modules/templates/helper.go 14
-rw-r--r-- modules/uploader/update.go 108
3 files changed, 127 insertions, 7 deletions
diff --git a/modules/base/tool.go b/modules/base/tool.go
index b069e5fae..4dd7ba59b 100644
--- a/modules/base/tool.go
+++ b/modules/base/tool.go
@@ -5,6 +5,7 @@
package base
import (
+ "bytes"
"crypto/md5"
"crypto/rand"
"crypto/sha1"
@@ -32,6 +33,9 @@ import (
"github.com/gogits/chardet"
)
+// UTF8BOM is the UTF-8 byte-order mark: the three bytes EF BB BF.
+var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'}
+
// EncodeMD5 encodes string to md5 hex value.
func EncodeMD5(str string) string {
m := md5.New()
@@ -87,6 +91,14 @@ func DetectEncoding(content []byte) (string, error) {
return result.Charset, err
}
+// RemoveBOMIfPresent returns content with a leading UTF-8 BOM stripped.
+// Content that does not start with the BOM is returned unchanged.
+func RemoveBOMIfPresent(content []byte) []byte {
+	return bytes.TrimPrefix(content, UTF8BOM)
+}
+}
+
// BasicAuthDecode decode basic auth string
func BasicAuthDecode(encoded string) (string, string, error) {
s, err := base64.StdEncoding.DecodeString(encoded)
diff --git a/modules/templates/helper.go b/modules/templates/helper.go
index ce077d1a9..84952d14e 100644
--- a/modules/templates/helper.go
+++ b/modules/templates/helper.go
@@ -267,7 +267,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
if err != nil {
return "", err
} else if charsetLabel == "UTF-8" {
- return string(content), nil
+ return string(base.RemoveBOMIfPresent(content)), nil
}
encoding, _ := charset.Lookup(charsetLabel)
@@ -277,19 +277,21 @@ func ToUTF8WithErr(content []byte) (string, error) {
// If there is an error, we concatenate the nicely decoded part and the
// original left over. This way we won't lose data.
- result, n, err := transform.String(encoding.NewDecoder(), string(content))
+ result, n, err := transform.Bytes(encoding.NewDecoder(), content)
if err != nil {
- result = result + string(content[n:])
+ result = append(result, content[n:]...)
}
- return result, err
+ result = base.RemoveBOMIfPresent(result)
+
+ return string(result), err
}
// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
func ToUTF8WithFallback(content []byte) []byte {
charsetLabel, err := base.DetectEncoding(content)
if err != nil || charsetLabel == "UTF-8" {
- return content
+ return base.RemoveBOMIfPresent(content)
}
encoding, _ := charset.Lookup(charsetLabel)
@@ -304,7 +306,7 @@ func ToUTF8WithFallback(content []byte) []byte {
return append(result, content[n:]...)
}
- return result
+ return base.RemoveBOMIfPresent(result)
}
// ToUTF8 converts content to UTF8 encoding and ignore error
diff --git a/modules/uploader/update.go b/modules/uploader/update.go
index 08caf11ee..fe85cbf47 100644
--- a/modules/uploader/update.go
+++ b/modules/uploader/update.go
@@ -5,15 +5,85 @@
package uploader
import (
+ "bytes"
"fmt"
"strings"
+ "golang.org/x/net/html/charset"
+ "golang.org/x/text/transform"
+
"code.gitea.io/git"
"code.gitea.io/gitea/models"
+ "code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/lfs"
+ "code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
)
+// detectEncodingAndBOM samples up to the first 1024 bytes of the blob behind
+// entry and returns the detected charset label together with whether the
+// sample, once decoded to UTF-8, starts with a UTF-8 BOM.
+//
+// When the LFS server is enabled and the blob is an LFS pointer whose meta
+// object exists in repo, the sample is taken from the LFS object instead.
+//
+// Any failure (opening or reading the data, detecting the encoding, or an
+// unknown charset) yields the default ("UTF-8", false).
+func detectEncodingAndBOM(entry *git.TreeEntry, repo *models.Repository) (string, bool) {
+	reader, err := entry.Blob().DataAsync()
+	if err != nil {
+		// return default
+		return "UTF-8", false
+	}
+	defer reader.Close()
+	buf := make([]byte, 1024)
+	n, err := reader.Read(buf)
+	if err != nil {
+		// return default
+		return "UTF-8", false
+	}
+	buf = buf[:n]
+
+	if setting.LFS.StartServer {
+		meta := lfs.IsPointerFile(&buf)
+		if meta != nil {
+			meta, err = repo.GetLFSMetaObjectByOid(meta.Oid)
+			if err != nil && err != models.ErrLFSObjectNotExist {
+				// return default
+				return "UTF-8", false
+			}
+		}
+		if meta != nil {
+			dataRc, err := lfs.ReadMetaObject(meta)
+			if err != nil {
+				// return default
+				return "UTF-8", false
+			}
+			defer dataRc.Close()
+			buf = make([]byte, 1024)
+			n, err = dataRc.Read(buf)
+			if err != nil {
+				// return default
+				return "UTF-8", false
+			}
+			buf = buf[:n]
+		}
+	}
+
+	encoding, err := base.DetectEncoding(buf)
+	if err != nil {
+		// just default to utf-8 and no bom
+		return "UTF-8", false
+	}
+	if encoding == "UTF-8" {
+		// HasPrefix is length-safe: a sample shorter than the BOM simply
+		// does not match, instead of slicing past the bytes actually read.
+		return encoding, bytes.HasPrefix(buf, base.UTF8BOM)
+	}
+	charsetEncoding, _ := charset.Lookup(encoding)
+	if charsetEncoding == nil {
+		return "UTF-8", false
+	}
+
+	// The sample may be cut mid-character, so a decode error is tolerated:
+	// only the leading bytes of whatever was decoded matter here. The check is
+	// made against the decoded result itself rather than the number of source
+	// bytes consumed, which says nothing about the result's length.
+	result, _, _ := transform.String(charsetEncoding.NewDecoder(), string(buf))
+	return encoding, strings.HasPrefix(result, string(base.UTF8BOM))
+}
+
// UpdateRepoFileOptions holds the repository file update options
type UpdateRepoFileOptions struct {
LastCommitID string
@@ -45,12 +115,29 @@ func UpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepo
return fmt.Errorf("UpdateRepoFile: %v", err)
}
+ encoding := "UTF-8"
+ bom := false
+
if opts.IsNewFile {
for _, file := range filesInIndex {
if file == opts.NewTreeName {
return models.ErrRepoFileAlreadyExist{FileName: opts.NewTreeName}
}
}
+ } else {
+ gitRepo, err := git.OpenRepository(t.basePath)
+ if err != nil {
+ return err
+ }
+ tree, err := gitRepo.GetTree("HEAD")
+ if err != nil {
+ return err
+ }
+ entry, err := tree.GetTreeEntryByPath(opts.OldTreeName)
+ if err != nil {
+ return err
+ }
+ encoding, bom = detectEncodingAndBOM(entry, repo)
}
//var stdout string
@@ -72,9 +159,28 @@ func UpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepo
}
content := opts.Content
+ if bom {
+ content = string(base.UTF8BOM) + content
+ }
+ if encoding != "UTF-8" {
+ charsetEncoding, _ := charset.Lookup(encoding)
+ if charsetEncoding != nil {
+ result, _, err := transform.String(charsetEncoding.NewEncoder(), string(content))
+ if err != nil {
+ // Look if we can't encode back in to the original we should just stick with utf-8
+ log.Error(4, "Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", opts.NewTreeName, opts.OldTreeName, encoding, err)
+ result = content
+ }
+ content = result
+ } else {
+ log.Error(4, "Unknown encoding: %s", encoding)
+ }
+ }
+ // Reset the opts.Content with the re-encoded and BOM'd content
+ opts.Content = content
var lfsMetaObject *models.LFSMetaObject
- if filename2attribute2info[opts.NewTreeName] != nil && filename2attribute2info[opts.NewTreeName]["filter"] == "lfs" {
+ if setting.LFS.StartServer && filename2attribute2info[opts.NewTreeName] != nil && filename2attribute2info[opts.NewTreeName]["filter"] == "lfs" {
// OK so we are supposed to LFS this data!
oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content))
if err != nil {