Skip to content

Commit

Permalink
BED-4463: Implement BOM encoding package and refactor WriteAndValidat…
Browse files Browse the repository at this point in the history
…eJSON for improved Unicode handling (#678)

feat(bed-4463): Implement BOM encoding detection and UTF conversion

This commit introduces a robust package for handling Byte Order Mark (BOM) encoding detection and UTF conversions, focusing on UTF-8, UTF-16 (BE/LE), and UTF-32 (BE/LE). It also refactors the WriteAndValidateJSON function to use this new package.

Key changes and features:

1. Encoding Interface: 
   - Introduced a unified Encoding interface with String(), Sequence(), and HasSequence() methods.

2. BOM Detection and UTF Conversion:
   - Implemented precise BOM detection for multiple encodings.
   - Added UTF-16 and UTF-32 to UTF-8 conversion with proper surrogate pair handling for UTF-16.
   - Utilized efficient bitwise operations for byte manipulation and endianness handling.

3. WriteAndValidateJSON Refactor:
   - Now uses bomenc.NormalizeToUTF8 for BOM detection and removal.
   - Normalizes input to UTF-8 for consistent JSON processing.
   - Simplified logic and improved error handling.

4. Testing:
   - Added comprehensive test suites for encoding detection, conversion, and edge cases.
   - Updated tests for WriteAndValidateJSON to reflect the new functionality.

5. Documentation:
   - Added detailed comments explaining bitwise operations and the rationale behind implementation choices.

This implementation enhances our handling of text encodings across the codebase, particularly for IO operations with potentially BOM-prefixed content. It improves robustness and maintainability and sets a foundation for future Unicode-related features.

Note: The current implementation of ValidateMetaTag does not invalidate JSON data that contains syntax errors; this known limitation is reflected in the updated tests.
  • Loading branch information
neumachen committed Jul 2, 2024
1 parent 0b755fd commit b5bf10d
Show file tree
Hide file tree
Showing 16 changed files with 1,658 additions and 72 deletions.
17 changes: 10 additions & 7 deletions cmd/api/src/api/v2/file_uploads.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,24 @@ package v2
import (
"errors"
"fmt"
"mime"
"net/http"
"slices"
"strconv"
"strings"

"github.com/gorilla/mux"
"github.com/specterops/bloodhound/bomenc"
"github.com/specterops/bloodhound/headers"
"github.com/specterops/bloodhound/log"
"github.com/specterops/bloodhound/src/api"
"github.com/specterops/bloodhound/src/auth"
"github.com/specterops/bloodhound/src/ctx"
"github.com/specterops/bloodhound/src/model"
ingestModel "github.com/specterops/bloodhound/src/model/ingest"
"github.com/specterops/bloodhound/src/services/fileupload"
"github.com/specterops/bloodhound/src/services/ingest"
"mime"
"net/http"
"slices"
"strconv"
"strings"

ingestModel "github.com/specterops/bloodhound/src/model/ingest"
)

const FileUploadJobIdPathParameterName = "file_upload_job_id"
Expand Down Expand Up @@ -135,7 +138,7 @@ func (s Resources) ProcessFileUpload(response http.ResponseWriter, request *http
api.WriteErrorResponse(request.Context(), api.BuildErrorResponse(http.StatusBadRequest, api.ErrorResponseDetailsIDMalformed, request), response)
} else if fileUploadJob, err := fileupload.GetFileUploadJobByID(request.Context(), s.DB, int64(fileUploadJobID)); err != nil {
api.HandleDatabaseError(request, response, err)
} else if fileName, fileType, err := fileupload.SaveIngestFile(s.Config.TempDirectory(), request); errors.Is(err, fileupload.ErrInvalidJSON) {
} else if fileName, fileType, err := fileupload.SaveIngestFile(s.Config.TempDirectory(), request); errors.Is(err, bomenc.ErrUnknownEncodingInvalidUTF8) {
api.WriteErrorResponse(request.Context(), api.BuildErrorResponse(http.StatusBadRequest, fmt.Sprintf("Error saving ingest file: %v", err), request), response)
} else if err != nil {
api.WriteErrorResponse(request.Context(), api.BuildErrorResponse(http.StatusInternalServerError, fmt.Sprintf("Error saving ingest file: %v", err), request), response)
Expand Down
15 changes: 8 additions & 7 deletions cmd/api/src/api/v2/file_uploads_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,13 @@ import (
"bytes"
"compress/gzip"
"fmt"
"github.com/specterops/bloodhound/mediatypes"
"github.com/specterops/bloodhound/src/services/fileupload"
"io"
"net/http"
"testing"

"github.com/specterops/bloodhound/bomenc"
"github.com/specterops/bloodhound/mediatypes"

"github.com/specterops/bloodhound/headers"
"github.com/specterops/bloodhound/src/api/v2/integration"
"github.com/specterops/bloodhound/src/test/fixtures/fixtures"
Expand Down Expand Up @@ -169,7 +170,7 @@ func Test_FileUploadWorkFlowVersion5(t *testing.T) {
"v5/ingest/sessions.json",
})

//Assert that we created stuff we expected
// Assert that we created stuff we expected
testCtx.AssertIngest(fixtures.IngestAssertions)
}

Expand All @@ -188,7 +189,7 @@ func Test_FileUploadWorkFlowVersion6(t *testing.T) {
"v6/ingest/sessions.json",
})

//Assert that we created stuff we expected
// Assert that we created stuff we expected
testCtx.AssertIngest(fixtures.IngestAssertions)
testCtx.AssertIngest(fixtures.IngestAssertionsv6)
testCtx.AssertIngest(fixtures.PropertyAssertions)
Expand Down Expand Up @@ -239,7 +240,7 @@ func Test_CompressedFileUploadWorkFlowVersion5(t *testing.T) {
"v5/ingest/sessions.json",
})

//Assert that we created stuff we expected
// Assert that we created stuff we expected
testCtx.AssertIngest(fixtures.IngestAssertions)
testCtx.AssertIngest(fixtures.PropertyAssertions)
}
Expand All @@ -259,7 +260,7 @@ func Test_CompressedFileUploadWorkFlowVersion6(t *testing.T) {
"v6/ingest/sessions.json",
})

//Assert that we created stuff we expected
// Assert that we created stuff we expected
testCtx.AssertIngest(fixtures.IngestAssertions)
testCtx.AssertIngest(fixtures.IngestAssertionsv6)
testCtx.AssertIngest(fixtures.PropertyAssertions)
Expand All @@ -268,5 +269,5 @@ func Test_CompressedFileUploadWorkFlowVersion6(t *testing.T) {
func Test_BadFileUploadError(t *testing.T) {
testCtx := integration.NewFOSSContext(t)

testCtx.SendInvalidFileIngest("v6/ingest/jker.jpg", fileupload.ErrInvalidJSON)
testCtx.SendInvalidFileIngest("v6/ingest/jker.jpg", bomenc.ErrUnknownEncodingInvalidUTF8)
}
34 changes: 14 additions & 20 deletions cmd/api/src/services/fileupload/file_upload.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,30 +19,27 @@ package fileupload

import (
"bufio"
"bytes"
"context"
"errors"
"fmt"
"github.com/specterops/bloodhound/headers"
"github.com/specterops/bloodhound/mediatypes"
"github.com/specterops/bloodhound/src/model/ingest"
"github.com/specterops/bloodhound/src/utils"
"io"
"net/http"
"os"
"time"

"github.com/specterops/bloodhound/bomenc"
"github.com/specterops/bloodhound/headers"
"github.com/specterops/bloodhound/mediatypes"
"github.com/specterops/bloodhound/src/model/ingest"
"github.com/specterops/bloodhound/src/utils"

"github.com/specterops/bloodhound/log"
"github.com/specterops/bloodhound/src/model"
)

const jobActivityTimeout = time.Minute * 20

const (
UTF8BOM1 = 0xef
UTF8BOM2 = 0xbb
UTF8BMO3 = 0xbf
)

var ErrInvalidJSON = errors.New("file is not valid json")

type FileUploadData interface {
Expand Down Expand Up @@ -120,17 +117,14 @@ func WriteAndValidateZip(src io.Reader, dst io.Writer) error {

func WriteAndValidateJSON(src io.Reader, dst io.Writer) error {
tr := io.TeeReader(src, dst)
bufReader := bufio.NewReader(tr)
if b, err := bufReader.Peek(3); err != nil {
normalizedContent, err := bomenc.NormalizeToUTF8(bufio.NewReader(tr))
if err != nil {
return err
} else {
if b[0] == UTF8BOM1 && b[1] == UTF8BOM2 && b[2] == UTF8BMO3 {
if _, err := bufReader.Discard(3); err != nil {
return err
}
}
}
_, err := ValidateMetaTag(bufReader, true)
_, err = ValidateMetaTag(
bytes.NewReader(normalizedContent.NormalizedContent()),
true,
)
return err
}

Expand All @@ -146,7 +140,7 @@ func SaveIngestFile(location string, request *http.Request) (string, model.FileT
} else if utils.HeaderMatches(request.Header, headers.ContentType.String(), ingest.AllowedZipFileUploadTypes...) {
return tempFile.Name(), model.FileTypeZip, WriteAndValidateFile(fileData, tempFile, WriteAndValidateZip)
} else {
//We should never get here since this is checked a level above
// We should never get here since this is checked a level above
return "", model.FileTypeJson, fmt.Errorf("invalid content type for ingest file")
}
}
Expand Down
124 changes: 88 additions & 36 deletions cmd/api/src/services/fileupload/file_upload_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,50 +18,19 @@ package fileupload

import (
"bytes"
"github.com/specterops/bloodhound/src/model/ingest"
"github.com/stretchr/testify/assert"
"errors"
"io"
"os"
"strings"
"testing"
)

func TestWriteAndValidateJSON(t *testing.T) {
t.Run("trigger invalid json on bad json", func(t *testing.T) {
var (
writer = bytes.Buffer{}
badJSON = strings.NewReader("{[]}")
)
err := WriteAndValidateJSON(badJSON, &writer)
assert.ErrorIs(t, err, ErrInvalidJSON)
})

t.Run("succeed on good json", func(t *testing.T) {
var (
writer = bytes.Buffer{}
goodJSON = strings.NewReader(`{"meta": {"methods": 0, "type": "sessions", "count": 0, "version": 5}, "data": []}`)
)
err := WriteAndValidateJSON(goodJSON, &writer)
assert.Nil(t, err)
})

t.Run("succeed on utf-8 BOM json", func(t *testing.T) {
var (
writer = bytes.Buffer{}
)

file, err := os.Open("../../test/fixtures/fixtures/utf8bomjson.json")
assert.Nil(t, err)
err = WriteAndValidateJSON(io.Reader(file), &writer)
assert.Nil(t, err)
})
}
"github.com/specterops/bloodhound/src/model/ingest"
"github.com/stretchr/testify/assert"
)

func TestWriteAndValidateZip(t *testing.T) {
t.Run("valid zip file is ok", func(t *testing.T) {
var (
writer = bytes.Buffer{}
)
writer := bytes.Buffer{}

file, err := os.Open("../../test/fixtures/fixtures/goodzip.zip")
assert.Nil(t, err)
Expand All @@ -80,3 +49,86 @@ func TestWriteAndValidateZip(t *testing.T) {
assert.Equal(t, err, ingest.ErrInvalidZipFile)
})
}

// TestWriteAndValidateJSON is a table-driven test for WriteAndValidateJSON.
// It covers BOM handling (UTF-8 with and without a BOM, UTF-16BE with a BOM)
// and validation failures for missing "meta"/"data" tags. In every case the
// destination buffer must receive the raw input bytes verbatim — including
// any BOM — because the tee into dst happens before BOM normalization.
func TestWriteAndValidateJSON(t *testing.T) {
	tests := []struct {
		name           string // subtest label
		input          []byte // raw bytes fed to WriteAndValidateJSON
		expectedOutput []byte // bytes expected in dst (always the unmodified input)
		expectedError  error  // expected sentinel error, or nil on success
	}{
		{
			name:           "UTF-8 without BOM",
			input:          []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`),
			expectedOutput: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`),
			expectedError:  nil,
		},
		{
			// 0xEF 0xBB 0xBF is the UTF-8 BOM; it is stripped for validation
			// but must still be present, untouched, in dst.
			name:           "UTF-8 with BOM",
			input:          append([]byte{0xEF, 0xBB, 0xBF}, []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`)...),
			expectedOutput: append([]byte{0xEF, 0xBB, 0xBF}, []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`)...),
			expectedError:  nil,
		},
		{
			// 0xFE 0xFF is the UTF-16 big-endian BOM; the payload is the same
			// JSON document with each ASCII byte preceded by a 0x00 byte.
			name:           "UTF-16BE with BOM",
			input:          []byte{0xFE, 0xFF, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x6D, 0x00, 0x65, 0x00, 0x74, 0x00, 0x61, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x74, 0x00, 0x79, 0x00, 0x70, 0x00, 0x65, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x22, 0x00, 0x64, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x61, 0x00, 0x69, 0x00, 0x6E, 0x00, 0x73, 0x00, 0x22, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x76, 0x00, 0x65, 0x00, 0x72, 0x00, 0x73, 0x00, 0x69, 0x00, 0x6F, 0x00, 0x6E, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x34, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x63, 0x00, 0x6F, 0x00, 0x75, 0x00, 0x6E, 0x00, 0x74, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x31, 0x00, 0x7D, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x64, 0x00, 0x61, 0x00, 0x74, 0x00, 0x61, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x5B, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x64, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x61, 0x00, 0x69, 0x00, 0x6E, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x22, 0x00, 0x65, 0x00, 0x78, 0x00, 0x61, 0x00, 0x6D, 0x00, 0x70, 0x00, 0x6C, 0x00, 0x65, 0x00, 0x2E, 0x00, 0x63, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x22, 0x00, 0x7D, 0x00, 0x5D, 0x00, 0x7D},
			expectedOutput: []byte{0xFE, 0xFF, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x6D, 0x00, 0x65, 0x00, 0x74, 0x00, 0x61, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x74, 0x00, 0x79, 0x00, 0x70, 0x00, 0x65, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x22, 0x00, 0x64, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x61, 0x00, 0x69, 0x00, 0x6E, 0x00, 0x73, 0x00, 0x22, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x76, 0x00, 0x65, 0x00, 0x72, 0x00, 0x73, 0x00, 0x69, 0x00, 0x6F, 0x00, 0x6E, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x34, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x63, 0x00, 0x6F, 0x00, 0x75, 0x00, 0x6E, 0x00, 0x74, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x31, 0x00, 0x7D, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x64, 0x00, 0x61, 0x00, 0x74, 0x00, 0x61, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x5B, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x64, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x61, 0x00, 0x69, 0x00, 0x6E, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x22, 0x00, 0x65, 0x00, 0x78, 0x00, 0x61, 0x00, 0x6D, 0x00, 0x70, 0x00, 0x6C, 0x00, 0x65, 0x00, 0x2E, 0x00, 0x63, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x22, 0x00, 0x7D, 0x00, 0x5D, 0x00, 0x7D},
			expectedError:  nil,
		},
		{
			name:           "Missing meta tag",
			input:          []byte(`{"data": [{"domain": "example.com"}]}`),
			expectedOutput: []byte(`{"data": [{"domain": "example.com"}]}`),
			expectedError:  ingest.ErrMetaTagNotFound,
		},
		{
			name:           "Missing data tag",
			input:          []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}}`),
			expectedOutput: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}}`),
			expectedError:  ingest.ErrDataTagNotFound,
		},
		// NOTE: this test discovers a bug where invalid JSON files are not being invalidated due to the current
		// implementation of ValidateMetaTag of decoding each token.
		// {
		// name: "Invalid JSON",
		// input: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"`),
		// expectedOutput: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"`),
		// expectedError: ErrInvalidJSON,
		// },
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			src := bytes.NewReader(tt.input)
			dst := &bytes.Buffer{}

			err := WriteAndValidateJSON(src, dst)
			if tt.expectedError != nil {
				assert.Error(t, err)
				assert.ErrorIs(t, err, tt.expectedError)
			} else {
				assert.NoError(t, err)
			}
			// Regardless of validation outcome, dst must contain the raw input.
			assert.Equal(t, tt.expectedOutput, dst.Bytes())
		})
	}
}

// TestWriteAndValidateJSON_NormalizationError verifies that an error raised
// while reading the source stream — i.e. before any JSON validation can
// happen — is propagated to the caller unchanged.
func TestWriteAndValidateJSON_NormalizationError(t *testing.T) {
	var (
		in  = &ErrorReader{err: errors.New("read error")}
		out = &bytes.Buffer{}
	)

	err := WriteAndValidateJSON(in, out)

	assert.EqualError(t, err, "read error")
}

// ErrorReader is a mock reader that always returns an error
type ErrorReader struct {
err error
}

func (er *ErrorReader) Read(p []byte) (n int, err error) {
return 0, er.err
}
5 changes: 3 additions & 2 deletions cmd/api/src/services/fileupload/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ package fileupload
import (
"encoding/json"
"errors"
"io"

"github.com/specterops/bloodhound/log"
"github.com/specterops/bloodhound/src/model/ingest"
"io"
)

var ZipMagicBytes = []byte{0x50, 0x4b, 0x03, 0x04}
Expand Down Expand Up @@ -52,7 +53,7 @@ func ValidateMetaTag(reader io.Reader, readToEnd bool) (ingest.Metadata, error)
return ingest.Metadata{}, ErrInvalidJSON
}
} else {
//Validate that our data tag is actually opening correctly
// Validate that our data tag is actually opening correctly
if dataTagFound && !dataTagValidated {
if typed, ok := token.(json.Delim); ok && typed == ingest.DelimOpenSquareBracket {
dataTagValidated = true
Expand Down
1 change: 1 addition & 0 deletions go.work
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ go 1.21
use (
./cmd/api/src
./packages/go/analysis
./packages/go/bomenc
./packages/go/cache
./packages/go/conftool
./packages/go/crypto
Expand Down
Loading

0 comments on commit b5bf10d

Please sign in to comment.