jhillyerd / enmime

MIME mail encoding and decoding package for Go
MIT License
464 stars 100 forks source link

Feature: Parse headers and stop #64

Closed jhillyerd closed 5 years ago

jhillyerd commented 6 years ago

Inbucket needs to read message headers during delivery, but doesn't need to parse the entire message until somebody tries to view it.

Go's built-in header parsing doesn't handle encoded headers. It would be nice if enmime could just parse the headers and return them.

requaos commented 5 years ago

@jhillyerd what type of encoding? Do you have a requirement to decode RFC2047 QP and B64 header values to plain-text?

requaos commented 5 years ago

I wrote this a while back to preserve the original order of the headers while decoding their values. However, the resulting Decoded []byte value is meant for humans, i.e. it's Unicode and doesn't respect folding-whitespace or line-length requirements.

// HeadersPreserved is the result produced by Sort: the header fields in the
// order they were appended, together with a rendered copy of those fields.
type HeadersPreserved struct {
    // Decoded holds one "Name: Value\r\n" line per entry in Headers, followed
    // by a terminating "\r\n".
    Decoded []byte
    // Headers lists the fields in the order Sort appended them.
    Headers Headers
}

// Headers is a slice of Header values; Sort fills it in the order the fields
// are encountered.
type Headers []Header

// Header is a single header field as a name/value pair.
type Header struct {
    // Name is the field name; Sort stores it in canonical MIME header form.
    Name  string
    // Value is the field value; Sort stores the output of RFC2047parts here.
    Value string
}

// Sort parses the header block at the start of the raw message b and returns
// its fields in the order they appear, with every value passed through
// RFC2047parts.
//
// b is first normalised with Clean. The fields are then parsed once with
// textproto's ReadMIMEHeader, which yields a map and therefore loses the
// relative order of different keys. To recover that order the raw header lines
// are scanned a second time and, for each field name found, the next unused
// value for that name is taken from the map.
//
// The returned Decoded buffer contains one "Name: Value\r\n" line per field
// followed by a terminating "\r\n". A non-nil error is returned (with a nil
// result) when ReadMIMEHeader fails with anything other than io.EOF, or when
// the line scanner itself fails, e.g. on an over-long header line.
func Sort(b []byte) (*HeadersPreserved, error) {
    b = Clean(b)
    tr := textproto.NewReader(bufio.NewReader(bytes.NewReader(b)))
    headers, err := tr.ReadMIMEHeader()
    switch errors.Cause(err) {
    case nil, io.EOF:
    // carry on, io.EOF is expected
    default:
        return nil, err
    }

    // bufio.Scanner buffers internally, so the reader needs no extra wrapping.
    bs := bufio.NewScanner(bytes.NewReader(b))
    res := Headers{}
    bw := &bytes.Buffer{}
    for bs.Scan() {
        line := bs.Text()
        if line == "" {
            // A blank line terminates the header block; everything after it
            // is body and must not be scanned for header names.
            break
        }
        if strings.HasPrefix(line, " ") || strings.HasPrefix(line, "\t") {
            // Continuation of the previous field; ReadMIMEHeader has already
            // folded it into that field's value.
            continue
        }
        i := strings.Index(line, ":")
        if i == -1 {
            continue
        }
        header := textproto.CanonicalMIMEHeaderKey(line[:i])
        values := headers[header]
        if len(values) == 0 {
            // ReadMIMEHeader did not record (or has no more) values for this
            // name, so there is nothing to pair the line with; skip it.
            continue
        }
        // Take the oldest remaining value for this name and drop it from the
        // map so that a repeated field picks up the following value.
        headers[header] = values[1:]

        h := Header{Name: header, Value: RFC2047parts(values[0])}
        res = append(res, h)
        fmt.Fprintf(bw, "%s: %s\r\n", h.Name, h.Value)
    }
    if err := bs.Err(); err != nil {
        // Surface scanner failures instead of returning a silently truncated
        // header list.
        return nil, err
    }
    bw.WriteString("\r\n")

    return &HeadersPreserved{
        Decoded: bw.Bytes(),
        Headers: res,
    }, nil
}

// Clean returns a copy of b in which the leading run of header-like lines is
// guaranteed to be followed by an empty CRLF line.
//
// b is split after every "\r\n". A line counts as header-like when it contains
// a ':' or begins with a space or tab. The first line that is not header-like
// ends the header run: if that line is not itself a bare "\r\n", one is
// inserted in front of it. Every line after that point is copied unchanged.
func Clean(b []byte) []byte {
    crlf := []byte("\r\n")
    out := make([]byte, 0, len(b)+2)
    inHeaders := true
    for _, line := range bytes.SplitAfter(b, crlf) {
        if inHeaders {
            folded := len(line) > 0 && (line[0] == ' ' || line[0] == '\t')
            if !folded && bytes.IndexByte(line, ':') < 0 {
                // First non-header line: close the header run, adding the
                // separating blank line when the input did not supply one.
                inHeaders = false
                if !bytes.Equal(line, crlf) {
                    out = append(out, crlf...)
                }
            }
        }
        out = append(out, line...)
    }

    return out
}

// RFC2047parts replaces every CR and LF in s with a space and then applies
// rfc2047recurse over and over, feeding each result back in, until that
// function reports an error. The string returned by that final call is the
// result.
// RFC2047 Example:
//     `=?UTF-8?B?bmFtZT0iw7DCn8KUwoo=?=`
func RFC2047parts(s string) string {
    flatten := func(r rune) rune {
        switch r {
        case '\r', '\n':
            return ' '
        }
        return r
    }
    out := strings.Map(flatten, s)
    for {
        next, err := rfc2047recurse(out)
        out = next
        if err != nil {
            // Any error (including the io.EOF "nothing left to do" signal)
            // ends the loop.
            return out
        }
    }
}

// rfc2047recurse performs one decoding pass over s.
//
// If s contains neither "?Q?" nor "?B?" (compared case-insensitively) it is
// returned unchanged together with io.EOF. Otherwise s is decoded with
// decodeHeader. When that leaves the string untouched, a second attempt is
// made on the output of fixRFC2047String; if the result still equals s,
// io.EOF is returned. A nil error means the returned string differs from s.
// RFC2047 Example:
//     `=?UTF-8?B?bmFtZT0iw7DCn8KUwoo=?=`
func rfc2047recurse(s string) (string, error) {
    upper := strings.ToUpper(s)
    hasQ := strings.Contains(upper, "?Q?")
    hasB := strings.Contains(upper, "?B?")
    if !(hasQ || hasB) {
        return s, io.EOF
    }

    decoded, err := decodeHeader(s)
    if err != nil {
        return decoded, err
    }
    if decoded != s {
        return decoded, nil
    }

    // Nothing was decoded: retry on a repaired copy of the string.
    decoded, err = decodeHeader(fixRFC2047String(decoded))
    if err != nil {
        return decoded, err
    }
    if decoded == s {
        return decoded, io.EOF
    }

    return decoded, nil
}

// decodeHeader runs input through a mime.WordDecoder whose CharsetReader is
// NewCharsetReader. On success the decoded text is returned; on failure the
// original input is returned alongside the decoder's error.
func decodeHeader(input string) (string, error) {
    decoder := mime.WordDecoder{CharsetReader: NewCharsetReader}
    decoded, err := decoder.DecodeHeader(input)
    if err == nil {
        return decoded, nil
    }
    return input, err
}

// fixRFC2047String removes spaces, carriage returns and line feeds that occur
// inside RFC 2047 encoded-words ("=?charset?encoding?text?=") in s, leaving
// all text outside encoded-words untouched.
//
// An encoded-word starts at "=?" and ends at the first "?=" seen after at
// least three '?' delimiters have followed the opener. The delimiter count is
// kept per encoded-word, so several encoded-words in one value are each
// handled, and a '?' or '=' elsewhere in the value (including base64 padding
// directly before the closing "?=") does not disturb the state.
func fixRFC2047String(s string) string {
    var sb strings.Builder
    sb.Grow(len(s))

    inWord := false // currently between "=?" and the closing "?="
    prevEq := false // previous rune outside a word was '='
    prevQ := false  // previous kept rune inside a word was '?'
    delims := 0     // '?' seen since the opening "=?"

    for _, v := range s {
        if !inWord {
            if v == '?' && prevEq {
                inWord, prevQ, delims = true, false, 0
            }
            prevEq = v == '='
            sb.WriteRune(v)
            continue
        }

        switch v {
        case '\n', '\r', ' ':
            // Whitespace is not allowed inside an encoded-word; drop it.
            continue
        case '?':
            delims++
            prevQ = true
        case '=':
            // "?=" closes the word only once charset, encoding and text have
            // all been delimited; an earlier '=' is Q-encoding or padding.
            if prevQ && delims >= 3 {
                inWord = false
            }
            prevQ = false
        default:
            prevQ = false
        }
        sb.WriteRune(v)
    }
    return sb.String()
}

For the NewCharsetReader, just use the one in the enmime internal pkg...

jhillyerd commented 5 years ago

Yes, essentially human readable decoding. All I really want is for enmime to do the exact same decoding it does now when building an Envelope, but then to stop before it starts trying to process the body of the email.

requaos commented 5 years ago

@jhillyerd just let me know where you envision this being implemented and any special rules or strictures for output formatting. Got some free cycles for a week or so.

jhillyerd commented 5 years ago

So the exact problem I'm trying to solve is here:

https://github.com/inbucket/inbucket/blob/master/pkg/message/manager.go#L53

I parse an entire email with enmime, but all I care about in that scenario is the From, To and Subject in UTF-8 from the primary header.

Returning the Envelope struct isn't mandatory. Let me know if that clarifies things.