Skip to content

Commit

Permalink
Add multibyte character support to encoder
Browse files Browse the repository at this point in the history
  • Loading branch information
ianlopshire committed Jan 7, 2022
1 parent 23e2fec commit aa2bbf6
Show file tree
Hide file tree
Showing 5 changed files with 211 additions and 56 deletions.
15 changes: 12 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ if err != nil {
fmt.Printf("%s", data)
// Output:
// 1 Ian Lopshire 99.5020 true

```

### Decode
Expand Down Expand Up @@ -95,15 +94,25 @@ for {
}
```

If you have an input where the indices are expressed in unicode codepoints, and
not raw bytes fixedwidth supports this. Your data must be UTF-8 encoded:
### UTF-8, Codepoints, and Multibyte Characters

fixedwidth supports encoding and decoding fixed-width data whose indices are expressed in
Unicode codepoints rather than raw bytes. The data must be UTF-8 encoded.

```go
decoder := fixedwidth.NewDecoder(strings.NewReader(data))
decoder.SetUseCodepointIndices(true)
// Decode as usual now
```


```go
buff := new(bytes.Buffer)
encoder := fixedwidth.NewEncoder(buff)
encoder.SetUseCodepointIndices(true)
// Encode as usual now
```

### Alignment Behavior

| Alignment | Encoding | Decoding |
Expand Down
140 changes: 90 additions & 50 deletions encode.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"io"
"reflect"
"strconv"
"strings"
)

// Marshal returns the fixed-width encoding of v.
Expand Down Expand Up @@ -60,6 +61,8 @@ func (e *MarshalInvalidTypeError) Error() string {
type Encoder struct {
// w is the buffered writer that encoded lines are written to.
w *bufio.Writer
// lineTerminator is the byte sequence configured through SetLineTerminator.
// NOTE(review): presumably emitted between lines; the use site is not visible here.
lineTerminator []byte

// useCodepointIndices is configured through SetUseCodepointIndices and is
// handed to the value encoders each time a line is written.
useCodepointIndices bool
}

// NewEncoder returns a new encoder that writes to w.
Expand All @@ -77,6 +80,13 @@ func (e *Encoder) SetLineTerminator(lineTerminator []byte) {
e.lineTerminator = lineTerminator
}

// SetUseCodepointIndices configures whether the Encoder interprets the
// positions in `fixed` struct tags as byte offsets (the default behavior) or
// as offsets counted in UTF-8 decoded codepoints.
//
// Pass true to use codepoint indices and false to use byte indices. The
// setting is consulted each time a line is written.
func (e *Encoder) SetUseCodepointIndices(use bool) {
e.useCodepointIndices = use
}

// Encode writes the fixed-width encoding of v to the
// stream.
// See the documentation for Marshal for details about
Expand Down Expand Up @@ -122,31 +132,31 @@ func (e *Encoder) writeLines(v reflect.Value) error {
}

func (e *Encoder) writeLine(v reflect.Value) (err error) {
b, err := newValueEncoder(v.Type())(v)
b, err := newValueEncoder(v.Type(), e.useCodepointIndices)(v)
if err != nil {
return err
}
_, err = e.w.Write(b)
_, err = e.w.WriteString(b.data)
return err
}

type valueEncoder func(v reflect.Value) ([]byte, error)
type valueEncoder func(v reflect.Value) (rawValue, error)

func newValueEncoder(t reflect.Type) valueEncoder {
func newValueEncoder(t reflect.Type, useCodepointIndices bool) valueEncoder {
if t == nil {
return nilEncoder
}
if t.Implements(reflect.TypeOf(new(encoding.TextMarshaler)).Elem()) {
return textMarshalerEncoder
return textMarshalerEncoder(useCodepointIndices)
}

switch t.Kind() {
case reflect.Ptr, reflect.Interface:
return ptrInterfaceEncoder
return ptrInterfaceEncoder(useCodepointIndices)
case reflect.Struct:
return structEncoder
return structEncoder(useCodepointIndices)
case reflect.String:
return stringEncoder
return stringEncoder(useCodepointIndices)
case reflect.Int, reflect.Int64, reflect.Int32, reflect.Int16, reflect.Int8:
return intEncoder
case reflect.Float64:
Expand All @@ -161,93 +171,123 @@ func newValueEncoder(t reflect.Type) valueEncoder {
return unknownTypeEncoder(t)
}

func (ve valueEncoder) Write(v reflect.Value, dst []byte, format format) error {
func (ve valueEncoder) Write(b *lineBuilder, v reflect.Value, spec fieldSpec) error {
format := spec.format
startIndex := spec.startPos - 1
value, err := ve(v)
if err != nil {
return err
}

if len(value) < len(dst) {
if value.len() < spec.len() {
switch {
case format.alignment == right:
padding := bytes.Repeat([]byte{format.padChar}, len(dst)-len(value))
copy(dst, padding)
copy(dst[len(padding):], value)
case spec.format.alignment == right:
padding := strings.Repeat(string(format.padChar), spec.len()-value.len())
b.WriteASCII(startIndex, padding)
b.WriteValue(startIndex+len(padding), value)
return nil

// The second case in this block is a special case to maintain backward
// compatibility. In previous versions of the library, only len(value) bytes were
// written to dst. This means overlapping intervals can, in effect, be used to
// coalesce a value.
case format.alignment == left, format.alignment == defaultAlignment && format.padChar != ' ':
padding := bytes.Repeat([]byte{format.padChar}, len(dst)-len(value))
copy(dst, value)
copy(dst[len(value):], padding)
padding := strings.Repeat(string(format.padChar), spec.len()-value.len())

b.WriteValue(startIndex, value)
b.WriteASCII(startIndex+value.len(), padding)
return nil
}
}

copy(dst, value)
if value.len() > spec.len() {
// If the value is too long it needs to be trimmed.
// TODO: Add strict mode that returns in this case.
value, err = value.slice(0, spec.len()-1)
if err != nil {
return err
}
}

b.WriteValue(startIndex, value)
return nil
}

func structEncoder(v reflect.Value) ([]byte, error) {
ss := cachedStructSpec(v.Type())
dst := bytes.Repeat([]byte(" "), ss.ll)
func structEncoder(useCodepointIndices bool) valueEncoder {
return func(v reflect.Value) (rawValue, error) {
ss := cachedStructSpec(v.Type())

for i, spec := range ss.fieldSpecs {
if !spec.ok {
continue
// Add a 10% headroom to the builder when codepoint indices are being used.
c := ss.ll
if useCodepointIndices {
c = int(1.1*float64(ss.ll)) + 1
}
b := newLineBuilder(ss.ll, c, ' ')

err := spec.encoder.Write(v.Field(i), dst[spec.startPos-1:spec.endPos:spec.endPos], spec.format)
if err != nil {
return nil, err
for i, spec := range ss.fieldSpecs {
if !spec.ok {
continue
}

enc := spec.getEncoder(useCodepointIndices)
err := enc.Write(b, v.Field(i), spec)
if err != nil {
return rawValue{}, err
}
}
}

return dst, nil
return b.AsRawValue(), nil
}
}

func textMarshalerEncoder(v reflect.Value) ([]byte, error) {
return v.Interface().(encoding.TextMarshaler).MarshalText()
// textMarshalerEncoder builds a valueEncoder for values that implement
// encoding.TextMarshaler. The marshaled text is wrapped in a rawValue using
// the supplied useCodepointIndices setting; a MarshalText error is returned
// unchanged together with an empty rawValue.
func textMarshalerEncoder(useCodepointIndices bool) valueEncoder {
	encode := func(v reflect.Value) (rawValue, error) {
		marshaler := v.Interface().(encoding.TextMarshaler)
		text, err := marshaler.MarshalText()
		if err != nil {
			return rawValue{}, err
		}
		return newRawValue(string(text), useCodepointIndices)
	}
	return encode
}

func ptrInterfaceEncoder(v reflect.Value) ([]byte, error) {
if v.IsNil() {
return nilEncoder(v)
func ptrInterfaceEncoder(useCodepointIndices bool) valueEncoder {
return func(v reflect.Value) (rawValue, error) {
if v.IsNil() {
return nilEncoder(v)
}
return newValueEncoder(v.Elem().Type(), useCodepointIndices)(v.Elem())
}
return newValueEncoder(v.Elem().Type())(v.Elem())
}

func stringEncoder(v reflect.Value) ([]byte, error) {
return []byte(v.String()), nil
// stringEncoder builds a valueEncoder for string values. The string is
// wrapped in a rawValue using the supplied useCodepointIndices setting.
func stringEncoder(useCodepointIndices bool) valueEncoder {
	var encode valueEncoder = func(v reflect.Value) (rawValue, error) {
		text := v.String()
		return newRawValue(text, useCodepointIndices)
	}
	return encode
}

func intEncoder(v reflect.Value) ([]byte, error) {
return []byte(strconv.Itoa(int(v.Int()))), nil
// intEncoder encodes a signed integer value as its base-10 text.
//
// The value is formatted directly from the int64 returned by
// reflect.Value.Int, so 64-bit fields are not narrowed to the
// platform-sized int (which would truncate them on 32-bit targets).
// Codepoint indexing is disabled for the result because decimal digits and
// the minus sign are single-byte ASCII.
func intEncoder(v reflect.Value) (rawValue, error) {
	return newRawValue(strconv.FormatInt(v.Int(), 10), false)
}

func floatEncoder(perc, bitSize int) valueEncoder {
return func(v reflect.Value) ([]byte, error) {
return []byte(strconv.FormatFloat(v.Float(), 'f', perc, bitSize)), nil
return func(v reflect.Value) (rawValue, error) {
return newRawValue(strconv.FormatFloat(v.Float(), 'f', perc, bitSize), false)
}
}

func boolEncoder(v reflect.Value) ([]byte, error) {
return []byte(strconv.FormatBool(v.Bool())), nil
// boolEncoder encodes a boolean value as the text produced by
// strconv.FormatBool ("true" or "false"). Codepoint indexing is disabled for
// the result.
func boolEncoder(v reflect.Value) (rawValue, error) {
	text := strconv.FormatBool(v.Bool())
	return newRawValue(text, false)
}

func nilEncoder(v reflect.Value) ([]byte, error) {
return nil, nil
// nilEncoder ignores its argument and always yields the zero rawValue with
// no error.
func nilEncoder(_ reflect.Value) (rawValue, error) {
	var empty rawValue
	return empty, nil
}

func unknownTypeEncoder(t reflect.Type) valueEncoder {
return func(value reflect.Value) ([]byte, error) {
return nil, &MarshalInvalidTypeError{typeName: t.Name()}
return func(value reflect.Value) (rawValue, error) {
return rawValue{}, &MarshalInvalidTypeError{typeName: t.Name()}
}
}

func uintEncoder(v reflect.Value) ([]byte, error) {
return []byte(strconv.FormatUint(v.Uint(), 10)), nil
// uintEncoder encodes an unsigned integer value as its base-10 text.
// Codepoint indexing is disabled for the result.
func uintEncoder(v reflect.Value) (rawValue, error) {
	digits := strconv.FormatUint(v.Uint(), 10)
	return newRawValue(digits, false)
}
77 changes: 75 additions & 2 deletions encode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ func TestMarshal(t *testing.T) {
F1 interface{} `fixed:"1,5"`
F2 interface{} `fixed:"6,10"`
}
type H2 struct {
F1 bool `fixed:"1,1"`
}
tagHelper := struct {
Valid string `fixed:"1,5"`
NoTags string
Expand All @@ -76,6 +79,7 @@ func TestMarshal(t *testing.T) {
}{
{"single line", H{"foo", 1}, []byte("foo 1 "), false},
{"multiple line", []H{{"foo", 1}, {"bar", 2}}, []byte("foo 1 \nbar 2 "), false},
{"multiple line (diff struct)", []interface{}{H{"foo", 1}, H2{false}}, []byte("foo 1 \nf"), false},
{"empty slice", []H{}, nil, false},
{"pointer", &H{"foo", 1}, []byte("foo 1 "), false},
{"nil", nil, nil, false},
Expand All @@ -90,6 +94,59 @@ func TestMarshal(t *testing.T) {
t.Errorf("Marshal() shouldErr expected %v, have %v (%v)", tt.shouldErr, err != nil, err)
}
if !tt.shouldErr && !bytes.Equal(o, tt.o) {
t.Errorf("Marshal() expected %q, have %q", string(tt.o), string(o))
}

// All tests should also pass with codepoint indices enabled.
t.Run("use codepoint indices", func(t *testing.T) {
buff := bytes.NewBuffer(nil)
enc := NewEncoder(buff)
enc.SetUseCodepointIndices(true)
err := enc.Encode(tt.i)
if tt.shouldErr != (err != nil) {
t.Errorf("Marshal() shouldErr expected %v, have %v (%v)", tt.shouldErr, err != nil, err)
}
if !tt.shouldErr && !bytes.Equal(buff.Bytes(), tt.o) {
t.Errorf("Marshal() expected %q, have %q", string(tt.o), string(o))
}
})

})
}
}

func TestMarshal_useCodepointIndices(t *testing.T) {
type H struct {
F1 string `fixed:"1,5"`
F2 string `fixed:"6,10"`
F3 string `fixed:"11,15"`
}

type HF struct {
F1 string `fixed:"1,5,right,#"`
F2 string `fixed:"6,10,left,#"`
F3 string `fixed:"11,15"`
}

for _, tt := range []struct {
name string
i interface{}
o []byte
}{
{name: "base case", i: H{"føø", "bår", "båz"}, o: []byte(`føø bår båz `)},
{name: "overflow", i: H{"føøøøøøøøøø", "bååååååååår", "bååååååååz"}, o: []byte(`føøøøbååååbåååå`)},
{name: "formatted", i: HF{"føø", "bår", "båz"}, o: []byte(`##føøbår##båz `)},
{name: "multibformatted overflow", i: HF{"føøøøøøøøøø", "bååååååååår", "bååååååååz"}, o: []byte(`føøøøbååååbåååå`)},
} {
t.Run(tt.name, func(t *testing.T) {
buff := bytes.NewBuffer(nil)
enc := NewEncoder(buff)
enc.SetUseCodepointIndices(true)
if err := enc.Encode(tt.i); err != nil {
t.Errorf("Marshal() unexpected error: %v", err)
return
}
if o := buff.Bytes(); !bytes.Equal(o, tt.o) {
t.Errorf("Marshal() expected %s, have %s", tt.o, o)
}
})
Expand Down Expand Up @@ -130,6 +187,22 @@ func TestMarshal_format(t *testing.T) {
want: []byte(`12345` + `12345` + `12345` + `12345` + `12345` + `12345`),
shouldErr: false,
},
{
name: "pad right",
v: struct {
F1 string `fixed:"1,5,right,#"`
}{"foo"},
want: []byte(`##foo`),
shouldErr: false,
},
{
name: "pad left",
v: struct {
F1 string `fixed:"1,5,left,#"`
}{"foo"},
want: []byte(`foo##`),
shouldErr: false,
},
} {
t.Run(tt.name, func(t *testing.T) {
have, err := Marshal(tt.v)
Expand Down Expand Up @@ -228,11 +301,11 @@ func TestNewValueEncoder(t *testing.T) {
{"*uint nil", nilUint, []byte(""), false},
} {
t.Run(tt.name, func(t *testing.T) {
o, err := newValueEncoder(reflect.TypeOf(tt.i))(reflect.ValueOf(tt.i))
o, err := newValueEncoder(reflect.TypeOf(tt.i), false)(reflect.ValueOf(tt.i))
if tt.shouldErr != (err != nil) {
t.Errorf("newValueEncoder(%s)() shouldErr expected %v, have %v (%v)", reflect.TypeOf(tt.i).Name(), tt.shouldErr, err != nil, err)
}
if !tt.shouldErr && !bytes.Equal(o, tt.o) {
if !tt.shouldErr && !bytes.Equal([]byte(o.data), tt.o) {
t.Errorf("newValueEncoder(%s)() expected %v, have %v", reflect.TypeOf(tt.i).Name(), tt.o, o)
}
})
Expand Down
Loading

0 comments on commit aa2bbf6

Please sign in to comment.