1package message
2
// todo: allow more invalid content-type values, we now stop parsing on: empty media type (eg "content-type: ; name=..."), empty value for property (eg "charset="), missing quotes for characters that should be quoted (eg boundary containing "=" but without quotes), duplicate properties (two charsets), empty pairs (eg "text/html;;").
4// todo: should we be forgiving when closing boundary in multipart message is missing? seems like spam messages do this...
5// todo: should we allow base64 messages where a line starts with a space? and possibly more whitespace. is happening in messages. coreutils base64 accepts it, encoding/base64 does not.
6// todo: handle comments in headers?
7// todo: should we just always store messages with \n instead of \r\n? \r\n seems easier for use with imap.
8// todo: can use a cleanup
9
10import (
11 "bufio"
12 "bytes"
13 "encoding/base64"
14 "errors"
15 "fmt"
16 "io"
17 "mime"
18 "mime/quotedprintable"
19 "net/mail"
20 "net/textproto"
21 "strings"
22 "time"
23
24 "golang.org/x/text/encoding/ianaindex"
25
26 "github.com/mjl-/mox/mlog"
27 "github.com/mjl-/mox/moxio"
28 "github.com/mjl-/mox/moxvar"
29 "github.com/mjl-/mox/smtp"
30)
31
var (
	// ErrBadContentType is wrapped in the error returned by newPart when the
	// Content-Type header cannot be parsed and parsing is strict or pedantic.
	ErrBadContentType = errors.New("bad content-type")
)

// Internal sentinel errors returned while parsing and reading parts.
var (
	// ParseNextPart was called on a part without a boundary.
	errNotMultipart = errors.New("not a multipart message")
	// The first boundary line found after the preamble was a closing boundary.
	errFirstBoundCloses = errors.New("first boundary cannot be finishing boundary")
	// A line exceeded the maximum line length in effect.
	errLineTooLong = errors.New("line too long")
	// A multipart content-type without usable "boundary" parameter.
	errMissingBoundaryParam = errors.New("missing/empty boundary content-type parameter")
	// End of data was reached while reading a sub part, before any boundary line.
	errMissingClosingBoundary = errors.New("eof without closing boundary")
	// A "\n" not preceded by "\r".
	errBareLF = errors.New("invalid bare line feed")
	// A "\r" not followed by "\n", only reported when strict or pedantic.
	errBareCR = errors.New("invalid bare carriage return")
	// Data ended where a complete crlf-terminated line was required.
	errUnexpectedEOF = errors.New("unexpected eof")
)

// If set, during tests, attempts to reparse a part will cause a panic, because
// sequentially reading parts should not lead to reparsing.
var enforceSequential bool
49
// Part represents a whole mail message, or a part of a multipart message. It
// is designed to handle IMAP requirements efficiently.
//
// The exported fields describe offsets, sizes and content headers of the part.
// The unexported fields hold parsing state: the underlying reader, the parsed
// header and the bookkeeping for finding the next sub part.
type Part struct {
	BoundaryOffset int64 // Offset in message where bound starts. -1 for top-level message.
	HeaderOffset   int64 // Offset in message file where header starts.
	BodyOffset     int64 // Offset in message file where body starts.
	EndOffset      int64 // Where body of part ends. Set when part is fully read. -1 after parsing until then.
	RawLineCount   int64 // Number of lines in raw, undecoded, body of part. Set when part is fully read.
	DecodedSize    int64 // Number of octets when decoded. If this is a text mediatype, lines ending only in LF are changed to end in CRLF and DecodedSize reflects that.

	MediaType               string            // From Content-Type, upper case. E.g. "TEXT". Can be empty because content-type may be absent. In this case, the part may be treated as TEXT/PLAIN.
	MediaSubType            string            // From Content-Type, upper case. E.g. "PLAIN".
	ContentTypeParams       map[string]string // E.g. holds "boundary" for multipart messages. Has lower-case keys, and original case values.
	ContentID               string            // Value of the Content-Id header.
	ContentDescription      string            // Value of the Content-Description header.
	ContentTransferEncoding string            // In upper case.
	Envelope                *Envelope         // Email message headers. Not for non-message parts.

	Parts []Part // Parts if this is a multipart.

	// Only for message/rfc822 and message/global. This part may have a buffer as
	// backing io.ReaderAt, because a message/global can have a non-identity
	// content-transfer-encoding. This part has a nil parent.
	Message *Part

	r               io.ReaderAt          // Source of the message data, see SetReaderAt.
	header          textproto.MIMEHeader // Parsed header.
	nextBoundOffset int64                // If >= 0, the offset where the next part header starts. We can set this when a user fully reads each part.
	lastBoundOffset int64                // Start of header of last/previous part. Used to skip a part if ParseNextPart is called and nextBoundOffset is -1.
	parent          *Part                // Parent part, for getting bound from, and setting nextBoundOffset when a part has finished reading. Only for subparts, not top-level parts.
	bound           []byte               // Only set if valid multipart with boundary, includes leading --, excludes \r\n.
	strict          bool                 // If set, valid crlf line endings are verified when reading body.
}
83
84// todo: have all Content* fields in Part?
85// todo: make Address contain a type Localpart and dns.Domain?
86// todo: if we ever make a major change and reparse all parts, switch to lower-case values if not too troublesome.
87
// Envelope holds the basic/common message headers as used in IMAP4.
//
// Values are filled in by parseEnvelope. Note that parseEnvelope constructs
// this struct positionally, so the order of the fields must not change.
type Envelope struct {
	Date      time.Time // From the Date header; zero time if absent/unparseable or beyond year 9999.
	Subject   string    // Q/B-word-decoded.
	From      []Address // Parsed "from" header, nil if the header cannot be parsed.
	Sender    []Address // Parsed "sender" header.
	ReplyTo   []Address // Parsed "reply-to" header.
	To        []Address // Parsed "to" header.
	CC        []Address // Parsed "cc" header.
	BCC       []Address // Parsed "bcc" header.
	InReplyTo string    // Raw value of the In-Reply-To header.
	MessageID string    // Raw value of the Message-Id header.
}
101
// Address as used in From and To headers.
//
// User and Host are left empty when the address could not be parsed as an SMTP
// address, Name is still set in that case.
type Address struct {
	Name string // Free-form name for display in mail applications.
	User string // Localpart.
	Host string // Domain in ASCII.
}
108
109// Parse reads the headers of the mail message and returns a part.
110// A part provides access to decoded and raw contents of a message and its multiple parts.
111//
112// If strict is set, fewer attempts are made to continue parsing when errors are
113// encountered, such as with invalid content-type headers or bare carriage returns.
114func Parse(log *mlog.Log, strict bool, r io.ReaderAt) (Part, error) {
115 return newPart(log, strict, r, 0, nil)
116}
117
118// EnsurePart parses a part as with Parse, but ensures a usable part is always
119// returned, even if error is non-nil. If a parse error occurs, the message is
120// returned as application/octet-stream, and headers can still be read if they
121// were valid.
122//
123// If strict is set, fewer attempts are made to continue parsing when errors are
124// encountered, such as with invalid content-type headers or bare carriage returns.
125func EnsurePart(log *mlog.Log, strict bool, r io.ReaderAt, size int64) (Part, error) {
126 p, err := Parse(log, strict, r)
127 if err == nil {
128 err = p.Walk(log, nil)
129 }
130 if err != nil {
131 np, err2 := fallbackPart(p, r, size)
132 if err2 != nil {
133 err = err2
134 }
135 p = np
136 }
137 return p, err
138}
139
140func fallbackPart(p Part, r io.ReaderAt, size int64) (Part, error) {
141 np := Part{
142 HeaderOffset: p.HeaderOffset,
143 BodyOffset: p.BodyOffset,
144 EndOffset: size,
145 MediaType: "APPLICATION",
146 MediaSubType: "OCTET-STREAM",
147 ContentTypeParams: p.ContentTypeParams,
148 ContentID: p.ContentID,
149 ContentDescription: p.ContentDescription,
150 ContentTransferEncoding: p.ContentTransferEncoding,
151 Envelope: p.Envelope,
152 // We don't keep:
153 // - BoundaryOffset: irrelevant for top-level message.
154 // - RawLineCount and DecodedSize: set below.
155 // - Parts: we are not treating this as a multipart message.
156 }
157 np.SetReaderAt(r)
158 // By reading body, the number of lines and decoded size will be set.
159 _, err := io.Copy(io.Discard, np.Reader())
160 return np, err
161}
162
163// SetReaderAt sets r as reader for this part and all its sub parts, recursively.
164// No reader is set for any Message subpart, see SetMessageReaderAt.
165func (p *Part) SetReaderAt(r io.ReaderAt) {
166 if r == nil {
167 panic("nil reader")
168 }
169 p.r = r
170 for i := range p.Parts {
171 pp := &p.Parts[i]
172 pp.SetReaderAt(r)
173 }
174}
175
176// SetMessageReaderAt sets a reader on p.Message, which must be non-nil.
177func (p *Part) SetMessageReaderAt() error {
178 // todo: if p.Message does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.Message, recursively.
179 buf, err := io.ReadAll(p.Reader())
180 if err != nil {
181 return err
182 }
183 p.Message.SetReaderAt(bytes.NewReader(buf))
184 return nil
185}
186
187// Walk through message, decoding along the way, and collecting mime part offsets and sizes, and line counts.
188func (p *Part) Walk(log *mlog.Log, parent *Part) error {
189 if len(p.bound) == 0 {
190 if p.MediaType == "MESSAGE" && (p.MediaSubType == "RFC822" || p.MediaSubType == "GLOBAL") {
191 // todo: don't read whole submessage in memory...
192 buf, err := io.ReadAll(p.Reader())
193 if err != nil {
194 return err
195 }
196 br := bytes.NewReader(buf)
197 mp, err := Parse(log, p.strict, br)
198 if err != nil {
199 return fmt.Errorf("parsing embedded message: %w", err)
200 }
201 if err := mp.Walk(log, nil); err != nil {
202 // If this is a DSN and we are not in pedantic mode, accept unexpected end of
203 // message. This is quite common because MTA's sometimes just truncate the original
204 // message in a place that makes the message invalid.
205 if errors.Is(err, errUnexpectedEOF) && !moxvar.Pedantic && parent != nil && len(parent.Parts) >= 3 && p == &parent.Parts[2] && parent.MediaType == "MULTIPART" && parent.MediaSubType == "REPORT" {
206 mp, err = fallbackPart(mp, br, int64(len(buf)))
207 if err != nil {
208 return fmt.Errorf("parsing invalid embedded message: %w", err)
209 }
210 } else {
211 return fmt.Errorf("parsing parts of embedded message: %w", err)
212 }
213 }
214 // todo: if mp does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.r on mp, recursively.
215 p.Message = &mp
216 return nil
217 }
218 _, err := io.Copy(io.Discard, p.Reader())
219 return err
220 }
221
222 for {
223 pp, err := p.ParseNextPart(log)
224 if err == io.EOF {
225 return nil
226 }
227 if err != nil {
228 return err
229 }
230 if err := pp.Walk(log, p); err != nil {
231 return err
232 }
233 }
234}
235
236// String returns a debugging representation of the part.
237func (p *Part) String() string {
238 return fmt.Sprintf("&Part{%s/%s offsets %d/%d/%d/%d lines %d decodedsize %d next %d last %d bound %q parts %v}", p.MediaType, p.MediaSubType, p.BoundaryOffset, p.HeaderOffset, p.BodyOffset, p.EndOffset, p.RawLineCount, p.DecodedSize, p.nextBoundOffset, p.lastBoundOffset, p.bound, p.Parts)
239}
240
// newPart parses a new part, which can be the top-level message.
// offset is the bound offset for parts, and the start of message for top-level messages. parent indicates if this is a top-level message or sub-part.
// If an error occurs, p's exported values can still be relevant. EnsurePart uses these values.
//
// Steps, in order: verify and skip the boundary line (sub parts only), collect
// and parse the header, derive the media type and other content headers, parse
// the envelope (top-level only), and for multiparts skip the preamble up to the
// first boundary. It panics if r is nil.
func newPart(log *mlog.Log, strict bool, r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {
	if r == nil {
		panic("nil reader")
	}
	p = Part{
		BoundaryOffset: -1,
		EndOffset:      -1,
		r:              r,
		parent:         parent,
		strict:         strict,
	}

	b := &bufAt{strict: strict, r: r, offset: offset}

	if parent != nil {
		// A sub part starts with the boundary line of its parent, which must not be the
		// closing boundary.
		p.BoundaryOffset = offset
		if line, _, err := b.ReadLine(true); err != nil {
			return p, err
		} else if match, finish := checkBound(line, parent.bound); !match {
			return p, fmt.Errorf("missing bound")
		} else if finish {
			return p, fmt.Errorf("new part for closing boundary")
		}
	}

	// Collect header.
	p.HeaderOffset = b.offset
	// Until the header is read completely, the body is regarded as starting at the
	// header, so p is consistent if we return early with an error.
	p.BodyOffset = b.offset
	hb := &bytes.Buffer{}
	for {
		line, _, err := b.ReadLine(true)
		if err == io.EOF {
			// No body is valid.
			break
		}
		if err != nil {
			return p, fmt.Errorf("reading header line: %w", err)
		}
		hb.Write(line)
		if len(line) == 2 {
			break // crlf
		}
	}
	p.BodyOffset = b.offset

	// Don't attempt to parse empty header, mail.ReadMessage doesn't like it.
	if p.HeaderOffset == p.BodyOffset {
		p.header = textproto.MIMEHeader{}
	} else {
		h, err := parseHeader(hb)
		if err != nil {
			return p, fmt.Errorf("parsing header: %w", err)
		}
		p.header = h
	}

	ct := p.header.Get("Content-Type")
	mt, params, err := mime.ParseMediaType(ct)
	if err != nil && ct != "" {
		if moxvar.Pedantic || strict {
			return p, fmt.Errorf("%w: %s: %q", ErrBadContentType, err, ct)
		}

		// Try parsing just a content-type, ignoring parameters.
		// ../rfc/2045:628
		ct = strings.TrimSpace(strings.SplitN(ct, ";", 2)[0])
		t := strings.SplitN(ct, "/", 2)
		// isToken reports whether s is non-empty and only has printable ascii
		// characters that are not separators.
		isToken := func(s string) bool {
			const separators = `()<>@,;:\\"/[]?= ` // ../rfc/2045:663
			for _, c := range s {
				if c < 0x20 || c >= 0x80 || strings.ContainsRune(separators, c) {
					return false
				}
			}
			return len(s) > 0
		}
		// We cannot recover content-type of multipart, we won't have a boundary.
		if len(t) == 2 && isToken(t[0]) && !strings.EqualFold(t[0], "multipart") && isToken(t[1]) {
			p.MediaType = strings.ToUpper(t[0])
			p.MediaSubType = strings.ToUpper(t[1])
		} else {
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		}
		log.Debugx("malformed content-type, attempting to recover and continuing", err, mlog.Field("contenttype", p.header.Get("Content-Type")), mlog.Field("mediatype", p.MediaType), mlog.Field("mediasubtype", p.MediaSubType))
	} else if mt != "" {
		t := strings.SplitN(strings.ToUpper(mt), "/", 2)
		if len(t) != 2 {
			// Media type without subtype.
			if moxvar.Pedantic || strict {
				return p, fmt.Errorf("bad content-type: %q (content-type %q)", mt, ct)
			}
			log.Debug("malformed media-type, ignoring and continuing", mlog.Field("type", mt))
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		} else {
			p.MediaType = t[0]
			p.MediaSubType = t[1]
			p.ContentTypeParams = params
		}
	}

	p.ContentID = p.header.Get("Content-Id")
	p.ContentDescription = p.header.Get("Content-Description")
	p.ContentTransferEncoding = strings.ToUpper(p.header.Get("Content-Transfer-Encoding"))

	// Only the top-level message gets an envelope.
	if parent == nil {
		p.Envelope, err = parseEnvelope(log, mail.Header(p.header))
		if err != nil {
			return p, err
		}
	}

	if p.MediaType == "MULTIPART" {
		s := params["boundary"]
		if s == "" {
			return p, errMissingBoundaryParam
		}
		p.bound = append([]byte("--"), s...)

		// Discard preamble, before first boundary.
		for {
			line, _, err := b.PeekLine(true)
			if err != nil {
				return p, fmt.Errorf("parsing line for part preamble: %w", err)
			}
			// Line only needs boundary prefix, not exact match. ../rfc/2046:1103
			// Well, for compatibility, we require whitespace after the boundary. Because some
			// software use the same boundary but with text appended for sub parts.
			if match, finish := checkBound(line, p.bound); match {
				if finish {
					return p, errFirstBoundCloses
				}
				break
			}
			// Consume the preamble line that was just peeked at. The peek of the same
			// line succeeded, so the result is not checked again.
			b.ReadLine(true)
		}
		// The first boundary line is left unread: the first sub part starts there.
		p.nextBoundOffset = b.offset
		p.lastBoundOffset = b.offset
	}

	return p, nil
}
386
387// Header returns the parsed header of this part.
388func (p *Part) Header() (textproto.MIMEHeader, error) {
389 if p.header != nil {
390 return p.header, nil
391 }
392 if p.HeaderOffset == p.BodyOffset {
393 p.header = textproto.MIMEHeader{}
394 return p.header, nil
395 }
396 h, err := parseHeader(p.HeaderReader())
397 p.header = h
398 return h, err
399}
400
401// HeaderReader returns a reader for the header section of this part, including ending bare CRLF.
402func (p *Part) HeaderReader() io.Reader {
403 return io.NewSectionReader(p.r, p.HeaderOffset, p.BodyOffset-p.HeaderOffset)
404}
405
// parseHeader parses all of r as a message header. Only call this on non-empty
// input (even though empty input is a valid header). On error, a nil header is
// returned.
func parseHeader(r io.Reader) (textproto.MIMEHeader, error) {
	// Parsing is done with mail.ReadMessage instead of textproto.ReadMIMEHeaders:
	// the first handles email messages properly, while the second only works for
	// HTTP headers.

	// The \r\n separating header and body is optional in our input, but parsing
	// without it results in an EOF error with Go <1.21. So we read the header in
	// full and add the separator when needed.
	// todo: directly parse from reader r when Go 1.20 is no longer supported.
	raw, err := io.ReadAll(r)
	if err != nil {
		return nil, err
	}
	lineEnd := []byte("\r\n")
	endsWithLine := bytes.HasSuffix(raw, lineEnd)
	endsWithSeparator := bytes.HasSuffix(raw, []byte("\r\n\r\n"))
	if endsWithLine && !endsWithSeparator {
		raw = append(raw, lineEnd...)
	}

	msg, err := mail.ReadMessage(bytes.NewReader(raw))
	if err != nil {
		return nil, err
	}
	return textproto.MIMEHeader(msg.Header), nil
}
429
430var wordDecoder = mime.WordDecoder{
431 CharsetReader: func(charset string, r io.Reader) (io.Reader, error) {
432 switch strings.ToLower(charset) {
433 case "", "us-ascii", "utf-8":
434 return r, nil
435 }
436 enc, _ := ianaindex.MIME.Encoding(charset)
437 if enc == nil {
438 enc, _ = ianaindex.IANA.Encoding(charset)
439 }
440 if enc == nil {
441 return r, fmt.Errorf("unknown charset %q", charset)
442 }
443 return enc.NewDecoder().Reader(r), nil
444 },
445}
446
447func parseEnvelope(log *mlog.Log, h mail.Header) (*Envelope, error) {
448 date, _ := h.Date()
449
450 // We currently marshal this field to JSON. But JSON cannot represent all
451 // time.Time. Time zone of 24:00 was seen in the wild. We won't try for extreme
452 // years, but we can readjust timezones.
453 // todo: remove this once we no longer store using json.
454 _, offset := date.Zone()
455 if date.Year() > 9999 {
456 date = time.Time{}
457 } else if offset <= -24*3600 || offset >= 24*3600 {
458 date = time.Unix(date.Unix(), 0).UTC()
459 }
460
461 subject := h.Get("Subject")
462 if s, err := wordDecoder.DecodeHeader(subject); err == nil {
463 subject = s
464 }
465
466 env := &Envelope{
467 date,
468 subject,
469 parseAddressList(log, h, "from"),
470 parseAddressList(log, h, "sender"),
471 parseAddressList(log, h, "reply-to"),
472 parseAddressList(log, h, "to"),
473 parseAddressList(log, h, "cc"),
474 parseAddressList(log, h, "bcc"),
475 h.Get("In-Reply-To"),
476 h.Get("Message-Id"),
477 }
478 return env, nil
479}
480
481func parseAddressList(log *mlog.Log, h mail.Header, k string) []Address {
482 // todo: possibly work around ios mail generating incorrect q-encoded "phrases" with unencoded double quotes? ../rfc/2047:382
483 l, err := h.AddressList(k)
484 if err != nil {
485 return nil
486 }
487 var r []Address
488 for _, a := range l {
489 // todo: parse more fully according to ../rfc/5322:959
490 var user, host string
491 addr, err := smtp.ParseAddress(a.Address)
492 if err != nil {
493 // todo: pass a ctx to this function so we can log with cid.
494 log.Infox("parsing address (continuing)", err, mlog.Field("address", a.Address))
495 } else {
496 user = addr.Localpart.String()
497 host = addr.Domain.ASCII
498 }
499 r = append(r, Address{a.Name, user, host})
500 }
501 return r
502}
503
// ParseNextPart parses the next (sub)part of this multipart message.
// ParseNextPart returns io.EOF and a nil part when there are no more parts.
// Only used for initial parsing of message. Once parsed, use p.Parts.
//
// The returned part points into p.Parts, to which the new part is appended. If
// p is not a multipart, errNotMultipart is returned.
func (p *Part) ParseNextPart(log *mlog.Log) (*Part, error) {
	if len(p.bound) == 0 {
		return nil, errNotMultipart
	}
	if p.nextBoundOffset == -1 {
		// The previous part was not read to its end, so we don't know yet where the
		// next boundary is.
		if enforceSequential {
			panic("access not sequential")
		}
		// Set nextBoundOffset by fully reading the last part.
		last, err := newPart(log, p.strict, p.r, p.lastBoundOffset, p)
		if err != nil {
			return nil, err
		}
		if _, err := io.Copy(io.Discard, last.RawReader()); err != nil {
			return nil, err
		}
		if p.nextBoundOffset == -1 {
			return nil, fmt.Errorf("internal error: reading part did not set nextBoundOffset")
		}
	}
	b := &bufAt{strict: p.strict, r: p.r, offset: p.nextBoundOffset}
	// todo: should we require a crlf on final closing bound? we don't require it because some message/rfc822 don't have a crlf after their closing boundary, so those messages don't end in crlf.
	line, crlf, err := b.ReadLine(false)
	if err != nil {
		return nil, err
	}
	if match, finish := checkBound(line, p.bound); !match {
		return nil, fmt.Errorf("expected bound, got %q", line)
	} else if finish {
		// Read any trailing data.
		if p.parent != nil {
			// Skip lines until a boundary of the parent or until a line can no longer be
			// read.
			for {
				line, _, err := b.PeekLine(false)
				if err != nil {
					break
				}
				if match, _ := checkBound(line, p.parent.bound); match {
					break
				}
				b.ReadLine(false)
			}
			// Only tell the parent where to continue if p is the part the parent parsed
			// last.
			if p.parent.lastBoundOffset == p.BoundaryOffset {
				p.parent.nextBoundOffset = b.offset
			}
		}
		p.EndOffset = b.offset
		return nil, io.EOF
	} else if !crlf {
		return nil, fmt.Errorf("non-finishing bound without crlf: %w", errUnexpectedEOF)
	}
	// Regular boundary: a new part starts here. The offset of the boundary after it
	// is unknown until the new part has been read.
	boundOffset := p.nextBoundOffset
	p.lastBoundOffset = boundOffset
	p.nextBoundOffset = -1
	np, err := newPart(log, p.strict, p.r, boundOffset, p)
	if err != nil {
		return nil, err
	}
	p.Parts = append(p.Parts, np)
	return &p.Parts[len(p.Parts)-1], nil
}
567
568// Reader returns a reader for the decoded body content.
569func (p *Part) Reader() io.Reader {
570 return p.bodyReader(p.RawReader())
571}
572
573// ReaderUTF8OrBinary returns a reader for the decode body content, transformed to
574// utf-8 for known mime/iana encodings (only if they aren't us-ascii or utf-8
575// already). For unknown or missing character sets/encodings, the original reader
576// is returned.
577func (p *Part) ReaderUTF8OrBinary() io.Reader {
578 return moxio.DecodeReader(p.ContentTypeParams["charset"], p.Reader())
579}
580
581func (p *Part) bodyReader(r io.Reader) io.Reader {
582 r = newDecoder(p.ContentTransferEncoding, r)
583 if p.MediaType == "TEXT" {
584 return &textReader{p, bufio.NewReader(r), 0, false}
585 }
586 return &countReader{p, r, 0}
587}
588
589// countReader is an io.Reader that passes Reads to the underlying reader.
590// when eof is read, it sets p.DecodedSize to the number of bytes returned.
591type countReader struct {
592 p *Part
593 r io.Reader
594 count int64
595}
596
597func (cr *countReader) Read(buf []byte) (int, error) {
598 n, err := cr.r.Read(buf)
599 if n >= 0 {
600 cr.count += int64(n)
601 }
602 if err == io.EOF {
603 cr.p.DecodedSize = cr.count
604 }
605 return n, err
606}
607
608// textReader is an io.Reader that ensures all lines return end in CRLF.
609// when eof is read from the underlying reader, it sets p.DecodedSize.
610type textReader struct {
611 p *Part
612 r *bufio.Reader
613 count int64
614 prevcr bool // If previous byte returned was a CR.
615}
616
617func (tr *textReader) Read(buf []byte) (int, error) {
618 o := 0
619 for o < len(buf) {
620 c, err := tr.r.ReadByte()
621 if err != nil {
622 tr.count += int64(o)
623 tr.p.DecodedSize = tr.count
624 return o, err
625 }
626 if c == '\n' && !tr.prevcr {
627 buf[o] = '\r'
628 o++
629 tr.prevcr = true
630 tr.r.UnreadByte()
631 continue
632 }
633 buf[o] = c
634 tr.prevcr = c == '\r'
635 o++
636 }
637 tr.count += int64(o)
638 return o, nil
639}
640
// newDecoder returns a reader that decodes r according to
// content-transfer-encoding cte, which must be in upper case. For any encoding
// other than BASE64 and QUOTED-PRINTABLE, r itself is returned.
func newDecoder(cte string, r io.Reader) io.Reader {
	// ../rfc/2045:775
	if cte == "BASE64" {
		return base64.NewDecoder(base64.StdEncoding, r)
	}
	if cte == "QUOTED-PRINTABLE" {
		return quotedprintable.NewReader(r)
	}
	return r
}
651
652// RawReader returns a reader for the raw, undecoded body content. E.g. with
653// quoted-printable or base64 content intact.
654// Fully reading a part helps its parent part find its next part efficiently.
655func (p *Part) RawReader() io.Reader {
656 if p.r == nil {
657 panic("missing reader")
658 }
659 if p.EndOffset >= 0 {
660 return &crlfReader{strict: p.strict, r: io.NewSectionReader(p.r, p.BodyOffset, p.EndOffset-p.BodyOffset)}
661 }
662 p.RawLineCount = 0
663 if p.parent == nil {
664 return &offsetReader{p, p.BodyOffset, p.strict, true, false, 0}
665 }
666 return &boundReader{p: p, b: &bufAt{strict: p.strict, r: p.r, offset: p.BodyOffset}, prevlf: true}
667}
668
669// crlfReader verifies there are no bare newlines and optionally no bare carriage returns.
670type crlfReader struct {
671 r io.Reader
672 strict bool
673 prevcr bool
674}
675
676func (r *crlfReader) Read(buf []byte) (int, error) {
677 n, err := r.r.Read(buf)
678 if err == nil || err == io.EOF {
679 for _, b := range buf[:n] {
680 if b == '\n' && !r.prevcr {
681 err = errBareLF
682 break
683 } else if b != '\n' && r.prevcr && (r.strict || moxvar.Pedantic) {
684 err = errBareCR
685 break
686 }
687 r.prevcr = b == '\r'
688 }
689 }
690 return n, err
691}
692
// bufAt is a buffered reader on an underlying ReaderAt.
// bufAt verifies that lines end with crlf.
//
// Data is read into buf by ensure. Lines are returned through scratch, which
// is overwritten by each next call that returns a line.
type bufAt struct {
	offset int64 // Offset in r currently consumed, i.e. not including any buffered data.

	strict  bool        // If set, the strict line length limit applies and bare carriage returns are errors.
	r       io.ReaderAt // Underlying data.
	buf     []byte      // Buffered data.
	nbuf    int         // Valid bytes in buf.
	scratch []byte      // Holds the line most recently returned.
}
704
705// Messages should not have lines longer than 78+2 bytes, and must not have
706// lines longer than 998+2 bytes. But in practice they have longer lines. We
707// have a higher limit, but for when parsing with strict we check for the 1000
708// bytes limit.
709// ../rfc/5321:3512
710const maxLineLength = 8 * 1024
711
712func (b *bufAt) maxLineLength() int {
713 if b.strict || moxvar.Pedantic {
714 return 1000
715 }
716 return maxLineLength
717}
718
719// ensure makes sure b.nbuf is up to maxLineLength, unless eof is encountered.
720func (b *bufAt) ensure() error {
721 for _, c := range b.buf[:b.nbuf] {
722 if c == '\n' {
723 return nil
724 }
725 }
726 if b.scratch == nil {
727 b.scratch = make([]byte, b.maxLineLength())
728 }
729 if b.buf == nil {
730 b.buf = make([]byte, b.maxLineLength())
731 }
732 for b.nbuf < b.maxLineLength() {
733 n, err := b.r.ReadAt(b.buf[b.nbuf:], b.offset+int64(b.nbuf))
734 if n > 0 {
735 b.nbuf += n
736 }
737 if err != nil && err != io.EOF || err == io.EOF && b.nbuf+n == 0 {
738 return err
739 }
740 if n == 0 || err == io.EOF {
741 break
742 }
743 }
744 return nil
745}
746
747// ReadLine reads a line until \r\n is found, returning the line including \r\n.
748// If not found, or a bare \n is encountered, or a bare \r is enountered in pedantic mode, ReadLine returns an error.
749func (b *bufAt) ReadLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
750 return b.line(true, requirecrlf)
751}
752
753func (b *bufAt) PeekLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
754 return b.line(false, requirecrlf)
755}
756
757func (b *bufAt) line(consume, requirecrlf bool) (buf []byte, crlf bool, err error) {
758 if err := b.ensure(); err != nil {
759 return nil, false, err
760 }
761 for i, c := range b.buf[:b.nbuf] {
762 if c == '\n' {
763 // Should have seen a \r, which should have been handled below.
764 return nil, false, errBareLF
765 }
766 if c != '\r' {
767 continue
768 }
769 i++
770 if i >= b.nbuf || b.buf[i] != '\n' {
771 if b.strict || moxvar.Pedantic {
772 return nil, false, errBareCR
773 }
774 continue
775 }
776 b.scratch = b.scratch[:i+1]
777 copy(b.scratch, b.buf[:i+1])
778 if consume {
779 copy(b.buf, b.buf[i+1:])
780 b.offset += int64(i + 1)
781 b.nbuf -= i + 1
782 }
783 return b.scratch, true, nil
784 }
785 if b.nbuf >= b.maxLineLength() {
786 return nil, false, errLineTooLong
787 }
788 if requirecrlf {
789 return nil, false, errUnexpectedEOF
790 }
791 b.scratch = b.scratch[:b.nbuf]
792 copy(b.scratch, b.buf[:b.nbuf])
793 if consume {
794 b.offset += int64(b.nbuf)
795 b.nbuf = 0
796 }
797 return b.scratch, false, nil
798}
799
800// PeekByte returns the next unread byte, or an error.
801func (b *bufAt) PeekByte() (byte, error) {
802 if err := b.ensure(); err != nil {
803 return 0, err
804 }
805 if b.nbuf == 0 {
806 return 0, io.EOF
807 }
808 return b.buf[0], nil
809}
810
811// offsetReader reads from p.r starting from offset, and RawLineCount on p.
812// offsetReader validates lines end with \r\n.
813type offsetReader struct {
814 p *Part
815 offset int64
816 strict bool
817 prevlf bool
818 prevcr bool
819 linelength int
820}
821
822func (r *offsetReader) Read(buf []byte) (int, error) {
823 n, err := r.p.r.ReadAt(buf, r.offset)
824 if n > 0 {
825 r.offset += int64(n)
826 max := maxLineLength
827 if r.strict || moxvar.Pedantic {
828 max = 1000
829 }
830
831 for _, c := range buf[:n] {
832 if r.prevlf {
833 r.p.RawLineCount++
834 }
835 if err == nil || err == io.EOF {
836 if c == '\n' && !r.prevcr {
837 err = errBareLF
838 } else if c != '\n' && r.prevcr && (r.strict || moxvar.Pedantic) {
839 err = errBareCR
840 }
841 }
842 r.prevlf = c == '\n'
843 r.prevcr = c == '\r'
844 r.linelength++
845 if c == '\n' {
846 r.linelength = 0
847 } else if r.linelength > max && err == nil {
848 err = errLineTooLong
849 }
850 }
851 }
852 if err == io.EOF {
853 r.p.EndOffset = r.offset
854 }
855 return n, err
856}
857
858var crlf = []byte("\r\n")
859
860// boundReader is a reader that stops at a closing multipart boundary.
861// boundReader ensures lines end with crlf through its use of bufAt.
862type boundReader struct {
863 p *Part
864 b *bufAt
865 buf []byte // Data from previous line, to be served first.
866 nbuf int // Number of valid bytes in buf.
867 crlf []byte // Possible crlf, to be returned if we do not yet encounter a boundary.
868 prevlf bool // If last char returned was a newline. For counting lines.
869}
870
871func (b *boundReader) Read(buf []byte) (count int, rerr error) {
872 origBuf := buf
873 defer func() {
874 if count > 0 {
875 for _, c := range origBuf[:count] {
876 if b.prevlf {
877 b.p.RawLineCount++
878 }
879 b.prevlf = c == '\n'
880 }
881 }
882 }()
883
884 for {
885 // Read data from earlier line.
886 if b.nbuf > 0 {
887 n := b.nbuf
888 if n > len(buf) {
889 n = len(buf)
890 }
891 copy(buf, b.buf[:n])
892 copy(b.buf, b.buf[n:])
893 buf = buf[n:]
894 b.nbuf -= n
895 count += n
896 if b.nbuf > 0 {
897 break
898 }
899 }
900
901 // Look at next line. If it is a boundary, we are done and won't serve the crlf from the last line.
902 line, _, err := b.b.PeekLine(false)
903 if match, _ := checkBound(line, b.p.parent.bound); match {
904 b.p.EndOffset = b.b.offset - int64(len(b.crlf))
905 if b.p.parent.lastBoundOffset == b.p.BoundaryOffset {
906 b.p.parent.nextBoundOffset = b.b.offset
907 } else if enforceSequential {
908 panic("access not sequential")
909 }
910 return count, io.EOF
911 }
912 if err == io.EOF {
913 err = errMissingClosingBoundary
914 }
915 if err != nil && err != io.EOF {
916 return count, err
917 }
918 if len(b.crlf) > 0 {
919 n := len(b.crlf)
920 if n > len(buf) {
921 n = len(buf)
922 }
923 copy(buf, b.crlf[:n])
924 count += n
925 buf = buf[n:]
926 b.crlf = b.crlf[n:]
927 }
928 if len(buf) == 0 {
929 break
930 }
931 line, _, err = b.b.ReadLine(true)
932 if err != nil {
933 // Could be an unexpected end of the part.
934 return 0, err
935 }
936 b.crlf = crlf // crlf will be read next time, but not if a boundary follows.
937 n := len(line) - 2
938 line = line[:n]
939 if n > len(buf) {
940 n = len(buf)
941 }
942 copy(buf, line[:n])
943 count += n
944 buf = buf[n:]
945 line = line[n:]
946 if len(line) > 0 {
947 if b.buf == nil {
948 b.buf = make([]byte, b.b.maxLineLength())
949 }
950 copy(b.buf, line)
951 b.nbuf = len(line)
952 }
953 }
954 return count, nil
955}
956
// checkBound checks if line is a boundary line for bound, where bound includes
// the leading "--". The first result indicates a match, the second whether
// this is a closing boundary, i.e. with "--" right after bound.
// For a non-closing boundary, bound must be followed by whitespace (space, tab,
// cr or lf) or the end of line. Other text after it means no match.
func checkBound(line, bound []byte) (bool, bool) {
	if len(line) < len(bound) || !bytes.Equal(line[:len(bound)], bound) {
		return false, false
	}
	rest := line[len(bound):]
	switch {
	case len(rest) == 0:
		return true, false
	case bytes.HasPrefix(rest, []byte("--")):
		return true, true
	}
	next := rest[0]
	whitespace := next == ' ' || next == '\t' || next == '\r' || next == '\n'
	return whitespace, false
}
975