Closed jhillyerd closed 5 years ago
@jhillyerd what type of encoding? Do you have a requirement to decode RFC2047 QP and B64 header values to plain-text?
I wrote this a while back to preserve the original order of the headers alongside their decoded values; note, however, that the resulting Decoded []byte
value is meant for humans, i.e. it's Unicode and doesn't respect folding-whitespace or line-length requirements.
// HeadersPreserved is the result returned by Sort: the header fields in the
// order Sort emitted them, together with a textual rendering of those fields.
type HeadersPreserved struct {
// Decoded is the rendering built by Sort: one "Name: Value\r\n" line per
// emitted header, followed by a terminating "\r\n".
Decoded []byte
// Headers holds the same fields as Name/Value pairs, in emission order.
Headers Headers
}
// Headers is an ordered list of header fields.
type Headers []Header
// Header is a single header field as a name/value pair.
type Header struct {
// Name is the field name; Sort stores it in canonical MIME form
// (textproto.CanonicalMIMEHeaderKey).
Name string
// Value is the field value; Sort stores the output of RFC2047parts here.
Value string
}
// Sort parses the header section of the raw message b and returns its fields
// in the order in which they appear, with every value passed through
// RFC2047parts.
//
// The input is first normalised with Clean and parsed once with
// textproto.Reader.ReadMIMEHeader to obtain the unfolded values per field
// name. The header lines are then walked a second time, in order, and each
// field name pops the next pending value for that name, which restores the
// original ordering that the map returned by ReadMIMEHeader does not keep.
//
// Only the header section is walked: scanning stops at the first empty line,
// so the message body is never inspected. The scanner buffer is sized to the
// input so that a header line longer than bufio.MaxScanTokenSize cannot
// silently truncate the result.
//
// It returns a non-nil error if ReadMIMEHeader fails with anything other than
// io.EOF, or if the line scanner reports an error.
func Sort(b []byte) (*HeadersPreserved, error) {
	b = Clean(b)
	tr := textproto.NewReader(bufio.NewReader(bytes.NewReader(b)))
	headers, err := tr.ReadMIMEHeader()
	// NOTE(review): errors.Cause is presumably github.com/pkg/errors — confirm
	// against the file's imports.
	switch errors.Cause(err) {
	case nil, io.EOF:
		// carry on, io.EOF is expected
	default:
		return nil, err
	}

	bs := bufio.NewScanner(bytes.NewReader(b))
	// Allow a single line to be as long as the whole input; the default limit
	// would make Scan stop early on an over-long header line.
	bs.Buffer(nil, len(b)+1)

	res := Headers{}
	bw := &bytes.Buffer{}
	for bs.Scan() {
		line := bs.Text()
		if line == "" {
			// Blank line: end of the header section, the rest is body.
			break
		}
		if strings.HasPrefix(line, " ") || strings.HasPrefix(line, "\t") {
			// Continuation of a folded value; ReadMIMEHeader already merged
			// it into the value of the preceding field.
			continue
		}
		i := strings.Index(line, ":")
		if i == -1 {
			continue
		}
		header := textproto.CanonicalMIMEHeaderKey(line[:i])
		if len(headers[header]) == 0 {
			// Nothing left to pop for this name: either the parser did not
			// record this line as a field or every value was already
			// consumed. Skip it rather than index an empty slice.
			continue
		}
		// Pop the oldest pending value for this name and shift the rest.
		firstValue := headers[header][0]
		headers[header] = headers[header][1:]

		h := Header{Name: header, Value: RFC2047parts(firstValue)}
		res = append(res, h)
		fmt.Fprintf(bw, "%s: %s\r\n", h.Name, h.Value)
	}
	if err := bs.Err(); err != nil {
		return nil, err
	}
	bw.WriteString("\r\n")

	return &HeadersPreserved{
		Decoded: bw.Bytes(),
		Headers: res,
	}, nil
}
// Clean returns a copy of b in which the leading header section is always
// separated from what follows by an empty CRLF line.
//
// The input is cut into pieces after every CRLF. Leading pieces that contain
// a ':' or begin with a space or tab are treated as header lines and copied
// unchanged. At the first piece that is neither, header mode ends: if that
// piece is not itself a bare CRLF, a CRLF is inserted in front of it. All
// remaining pieces are copied unchanged.
func Clean(b []byte) []byte {
	crlf := []byte("\r\n")
	out := make([]byte, 0, len(b)+2)
	inHeader := true

	for _, piece := range bytes.SplitAfter(b, crlf) {
		if inHeader {
			folded := len(piece) > 0 && (piece[0] == ' ' || piece[0] == '\t')
			if folded || bytes.IndexByte(piece, ':') >= 0 {
				out = append(out, piece...)
				continue
			}
			// First non-header piece: make sure a blank line precedes it.
			inHeader = false
			if !bytes.Equal(piece, crlf) {
				out = append(out, crlf...)
			}
		}
		out = append(out, piece...)
	}
	return out
}
// RFC2047parts returns s with any RFC 2047 encoded content decoded.
//
// CR and LF characters are first replaced by spaces, then rfc2047recurse is
// applied repeatedly to its own output until it reports an error (io.EOF when
// there is nothing further to decode); the string returned by that last call
// is the result.
//
// RFC2047 Example:
// `=?UTF-8?B?bmFtZT0iw7DCn8KUwoo=?=`
func RFC2047parts(s string) string {
	current := strings.Map(func(r rune) rune {
		switch r {
		case '\r', '\n':
			return ' '
		}
		return r
	}, s)

	for {
		next, err := rfc2047recurse(current)
		current = next
		if err != nil {
			return current
		}
	}
}
// rfc2047recurse performs one decoding pass over s.
//
// If s contains neither "?Q?" nor "?B?" (case-insensitively) it is returned
// unchanged with io.EOF. Otherwise s is decoded with decodeHeader. When that
// changes nothing, a second attempt is made on fixRFC2047String(s), which
// strips whitespace from inside encoded-words.
//
// Returns:
//   - (decoded, nil) when a pass changed the string; the caller may call
//     again to peel further layers.
//   - (s, io.EOF) when nothing could be decoded.
//   - (s, err) when decoding failed.
//
// The whitespace-stripped candidate is only a means to an end: unless
// decoding it actually produces something new, the untouched input is
// returned, so text that merely resembles RFC 2047 syntax does not lose its
// spaces.
//
// RFC2047 Example:
// `=?UTF-8?B?bmFtZT0iw7DCn8KUwoo=?=`
func rfc2047recurse(s string) (string, error) {
	us := strings.ToUpper(s)
	if !strings.Contains(us, "?Q?") && !strings.Contains(us, "?B?") {
		return s, io.EOF
	}

	val, err := decodeHeader(s)
	if err != nil {
		return s, err
	}
	if val != s {
		return val, nil
	}

	// Plain decoding made no progress; retry on a repaired copy.
	fixed := fixRFC2047String(s)
	if fixed == s {
		// Nothing to repair, so a second decode would give the same result.
		return s, io.EOF
	}
	val, err = decodeHeader(fixed)
	if err != nil {
		return s, err
	}
	if val == fixed || val == s {
		// The repaired copy did not decode either (or decoded back to the
		// input); report "done" with the original text so the caller's loop
		// terminates and no whitespace is lost.
		return s, io.EOF
	}
	return val, nil
}
// decodeHeader runs input through a mime.WordDecoder whose CharsetReader is
// NewCharsetReader and returns the decoded text. If the decoder reports an
// error, the unmodified input is returned together with that error.
func decodeHeader(input string) (string, error) {
	decoder := mime.WordDecoder{CharsetReader: NewCharsetReader}
	decoded, err := decoder.DecodeHeader(input)
	if err == nil {
		return decoded, nil
	}
	return input, err
}
// fixRFC2047String removes whitespace (space, tab, CR, LF) that appears
// inside RFC 2047 encoded-words of s, e.g. "=?UTF-8?Q?a b?=" becomes
// "=?UTF-8?Q?ab?=". Text outside encoded-words is copied unchanged.
//
// An encoded-word is taken to start at "=?" and to end at the first "?="
// that follows at least three '?' separators (after the charset, after the
// encoding, and before the closing '='). State is tracked against the output
// written so far, so "? =" inside a word still closes it once the space has
// been dropped. An opening "=?" that is never closed causes all whitespace up
// to the end of s to be removed.
//
// Every encoded-word in s is handled, '?' characters outside a word are
// ignored, and a base64 padding '=' directly before the closing "?=" does not
// prevent the word from being closed.
func fixRFC2047String(s string) string {
	var (
		sb     strings.Builder
		inWord bool // between an opening "=?" and its closing "?="
		marks  int  // '?' separators seen since the opening "=?"
		prev   rune // last rune written to sb; 0 before the first write
	)
	sb.Grow(len(s))

	for _, r := range s {
		switch r {
		case ' ', '\t', '\n', '\r':
			if inWord {
				// Whitespace is not legal inside an encoded-word: drop it
				// and leave prev untouched so "? =" still reads as "?=".
				continue
			}
		case '?':
			if inWord {
				marks++
			} else if prev == '=' {
				inWord = true
				marks = 0
			}
		case '=':
			if inWord && marks >= 3 && prev == '?' {
				inWord = false
			}
		}
		sb.WriteRune(r)
		prev = r
	}
	return sb.String()
}
For the NewCharsetReader, just use the one in the enmime internal pkg...
Yes, essentially human readable decoding. All I really want is for enmime to do the exact same decoding it does now when building an Envelope, but then to stop before it starts trying to process the body of the email.
@jhillyerd just let me know where you envision this being implemented and any special rules or strictures for output formatting. Got some free cycles for a week or so.
So the exact problem I'm trying to solve is here:
https://github.com/inbucket/inbucket/blob/master/pkg/message/manager.go#L53
I parse an entire email with enmime, but all I care about in that scenario is the From, To and Subject in UTF-8 from the primary header.
Returning the Envelope struct isn't mandatory. Let me know if that clarifies things.
Inbucket needs to read message headers during delivery, but doesn't need to parse the entire message until somebody tries to view it.
Go's built in header parsing doesn't handle encoded headers. It would be nice if enmime could just parse the headers and return them.