1package message

// todo: allow more invalid content-type values, we now stop parsing on: empty media type (eg "content-type: ; name=..."), empty value for property (eg "charset="), missing quotes for characters that should be quoted (eg boundary containing "=" but without quotes), duplicate properties (two charsets), empty pairs (eg "text/html;;").

4// todo: should we be forgiving when closing boundary in multipart message is missing? seems like spam messages do this...

5// todo: should we allow base64 messages where a line starts with a space? and possibly more whitespace. is happening in messages. coreutils base64 accepts it, encoding/base64 does not.

6// todo: handle comments in headers?

7// todo: should we just always store messages with \n instead of \r\n? \r\n seems easier for use with imap.

8// todo: can use a cleanup

10import (

11 "bufio"

12 "bytes"

13 "encoding/base64"

14 "errors"

15 "fmt"

16 "io"

17 "mime"

18 "mime/quotedprintable"

19 "net/mail"

20 "net/textproto"

21 "strings"

22 "time"

24 "golang.org/x/text/encoding/ianaindex"

26 "github.com/mjl-/mox/mlog"

27 "github.com/mjl-/mox/moxio"

28 "github.com/mjl-/mox/moxvar"

29 "github.com/mjl-/mox/smtp"

30)

// ErrBadContentType is wrapped by the error newPart returns for a content-type
// header that cannot be parsed, when parsing strictly or in pedantic mode.
// Callers can detect it with errors.Is.
var ErrBadContentType = errors.New("bad content-type")

// Internal sentinel errors for problems found while parsing messages and parts.
var (
	// Problems with the multipart structure and its boundaries.
	errNotMultipart           = errors.New("not a multipart message")
	errMissingBoundaryParam   = errors.New("missing/empty boundary content-type parameter")
	errFirstBoundCloses       = errors.New("first boundary cannot be finishing boundary")
	errMissingClosingBoundary = errors.New("eof without closing boundary")

	// Problems with lines and the end of input.
	errLineTooLong   = errors.New("line too long")
	errBareLF        = errors.New("invalid bare line feed")
	errBareCR        = errors.New("invalid bare carriage return")
	errUnexpectedEOF = errors.New("unexpected eof")
)

// enforceSequential is meant to be set during tests. If set, ParseNextPart
// panics when it would have to reparse the previous part to find the next
// boundary, because sequentially reading parts should not lead to reparsing.
var enforceSequential bool

// Part represents a whole mail message, or a part of a multipart message. It
// is designed to handle IMAP requirements efficiently.
//
// The exported fields describe the part: offsets into the message, sizes, and
// values taken from the Content-* headers. The unexported fields hold parsing
// state and the reader the part is read from.
type Part struct {
	BoundaryOffset int64 // Offset in message where bound starts. -1 for top-level message.
	HeaderOffset   int64 // Offset in message file where header starts.
	BodyOffset     int64 // Offset in message file where body starts.
	EndOffset      int64 // Where body of part ends. Set when part is fully read.
	RawLineCount   int64 // Number of lines in raw, undecoded, body of part. Set when part is fully read.
	DecodedSize    int64 // Number of octets when decoded. If this is a text mediatype, lines ending only in LF are changed to end in CRLF and DecodedSize reflects that.

	MediaType               string            // From Content-Type, upper case. E.g. "TEXT". Can be empty because content-type may be absent. In this case, the part may be treated as TEXT/PLAIN.
	MediaSubType            string            // From Content-Type, upper case. E.g. "PLAIN".
	ContentTypeParams       map[string]string // E.g. holds "boundary" for multipart messages. Has lower-case keys, and original case values.
	ContentID               string            // Value of the Content-Id header.
	ContentDescription      string            // Value of the Content-Description header.
	ContentTransferEncoding string            // In upper case.
	Envelope                *Envelope         // Email message headers. Not for non-message parts.

	Parts []Part // Parts if this is a multipart.

	// Only for message/rfc822 and message/global. This part may have a buffer as
	// backing io.ReaderAt, because a message/global can have a non-identity
	// content-transfer-encoding. This part has a nil parent.
	Message *Part

	r               io.ReaderAt          // Source the part is read from, set by newPart and SetReaderAt.
	header          textproto.MIMEHeader // Parsed header.
	nextBoundOffset int64                // If >= 0, the offset where the next part header starts. We can set this when a user fully reads each part.
	lastBoundOffset int64                // Start of header of last/previous part. Used to skip a part if ParseNextPart is called and nextBoundOffset is -1.
	parent          *Part                // Parent part, for getting bound from, and setting nextBoundOffset when a part has finished reading. Only for subparts, not top-level parts.
	bound           []byte               // Only set if valid multipart with boundary, includes leading --, excludes \r\n.
	strict          bool                 // If set, valid crlf line endings are verified when reading body.
}

84// todo: have all Content* fields in Part?

85// todo: make Address contain a type Localpart and dns.Domain?

86// todo: if we ever make a major change and reparse all parts, switch to lower-case values if not too troublesome.

// Envelope holds the basic/common message headers as used in IMAP4.
//
// The field order matters: parseEnvelope fills this struct with an unkeyed
// composite literal.
type Envelope struct {
	Date      time.Time // From the Date header.
	Subject   string    // Q/B-word-decoded.
	From      []Address
	Sender    []Address
	ReplyTo   []Address
	To        []Address
	CC        []Address
	BCC       []Address
	InReplyTo string // Value of the In-Reply-To header.
	MessageID string // Value of the Message-Id header.
}

101

// Address as used in From and To headers.
//
// User and Host are left empty by parseAddressList when the address could not
// be parsed into a localpart and domain.
type Address struct {
	Name string // Free-form name for display in mail applications.
	User string // Localpart.
	Host string // Domain in ASCII.
}

108

109// Parse reads the headers of the mail message and returns a part.

110// A part provides access to decoded and raw contents of a message and its multiple parts.

111//

112// If strict is set, fewer attempts are made to continue parsing when errors are

113// encountered, such as with invalid content-type headers or bare carriage returns.

114func Parse(log *mlog.Log, strict bool, r io.ReaderAt) (Part, error) {

115 return newPart(log, strict, r, 0, nil)

116}

117

118// EnsurePart parses a part as with Parse, but ensures a usable part is always

119// returned, even if error is non-nil. If a parse error occurs, the message is

120// returned as application/octet-stream, and headers can still be read if they

121// were valid.

122//

123// If strict is set, fewer attempts are made to continue parsing when errors are

124// encountered, such as with invalid content-type headers or bare carriage returns.

125func EnsurePart(log *mlog.Log, strict bool, r io.ReaderAt, size int64) (Part, error) {

126 p, err := Parse(log, strict, r)

127 if err == nil {

128 err = p.Walk(log, nil)

129 }

130 if err != nil {

131 np, err2 := fallbackPart(p, r, size)

132 if err2 != nil {

133 err = err2

134 }

135 p = np

136 }

137 return p, err

138}

139

140func fallbackPart(p Part, r io.ReaderAt, size int64) (Part, error) {

141 np := Part{

142 HeaderOffset: p.HeaderOffset,

143 BodyOffset: p.BodyOffset,

144 EndOffset: size,

145 MediaType: "APPLICATION",

146 MediaSubType: "OCTET-STREAM",

147 ContentTypeParams: p.ContentTypeParams,

148 ContentID: p.ContentID,

149 ContentDescription: p.ContentDescription,

150 ContentTransferEncoding: p.ContentTransferEncoding,

151 Envelope: p.Envelope,

152 // We don't keep:

153 // - BoundaryOffset: irrelevant for top-level message.

154 // - RawLineCount and DecodedSize: set below.

155 // - Parts: we are not treating this as a multipart message.

156 }

157 np.SetReaderAt(r)

158 // By reading body, the number of lines and decoded size will be set.

159 _, err := io.Copy(io.Discard, np.Reader())

160 return np, err

161}

162

163// SetReaderAt sets r as reader for this part and all its sub parts, recursively.

164// No reader is set for any Message subpart, see SetMessageReaderAt.

165func (p *Part) SetReaderAt(r io.ReaderAt) {

166 if r == nil {

167 panic("nil reader")

168 }

169 p.r = r

170 for i := range p.Parts {

171 pp := &p.Parts[i]

172 pp.SetReaderAt(r)

173 }

174}

175

176// SetMessageReaderAt sets a reader on p.Message, which must be non-nil.

177func (p *Part) SetMessageReaderAt() error {

178 // todo: if p.Message does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.Message, recursively.

179 buf, err := io.ReadAll(p.Reader())

180 if err != nil {

181 return err

182 }

183 p.Message.SetReaderAt(bytes.NewReader(buf))

184 return nil

185}

186

187// Walk through message, decoding along the way, and collecting mime part offsets and sizes, and line counts.

188func (p *Part) Walk(log *mlog.Log, parent *Part) error {

189 if len(p.bound) == 0 {

190 if p.MediaType == "MESSAGE" && (p.MediaSubType == "RFC822" || p.MediaSubType == "GLOBAL") {

191 // todo: don't read whole submessage in memory...

192 buf, err := io.ReadAll(p.Reader())

193 if err != nil {

194 return err

195 }

196 br := bytes.NewReader(buf)

197 mp, err := Parse(log, p.strict, br)

198 if err != nil {

199 return fmt.Errorf("parsing embedded message: %w", err)

200 }

201 if err := mp.Walk(log, nil); err != nil {

202 // If this is a DSN and we are not in pedantic mode, accept unexpected end of

203 // message. This is quite common because MTA's sometimes just truncate the original

204 // message in a place that makes the message invalid.

205 if errors.Is(err, errUnexpectedEOF) && !moxvar.Pedantic && parent != nil && len(parent.Parts) >= 3 && p == &parent.Parts[2] && parent.MediaType == "MULTIPART" && parent.MediaSubType == "REPORT" {

206 mp, err = fallbackPart(mp, br, int64(len(buf)))

207 if err != nil {

208 return fmt.Errorf("parsing invalid embedded message: %w", err)

209 }

210 } else {

211 return fmt.Errorf("parsing parts of embedded message: %w", err)

212 }

213 }

214 // todo: if mp does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.r on mp, recursively.

215 p.Message = &mp

216 return nil

217 }

218 _, err := io.Copy(io.Discard, p.Reader())

219 return err

220 }

221

222 for {

223 pp, err := p.ParseNextPart(log)

224 if err == io.EOF {

225 return nil

226 }

227 if err != nil {

228 return err

229 }

230 if err := pp.Walk(log, p); err != nil {

231 return err

232 }

233 }

234}

235

236// String returns a debugging representation of the part.

237func (p *Part) String() string {

238 return fmt.Sprintf("&Part{%s/%s offsets %d/%d/%d/%d lines %d decodedsize %d next %d last %d bound %q parts %v}", p.MediaType, p.MediaSubType, p.BoundaryOffset, p.HeaderOffset, p.BodyOffset, p.EndOffset, p.RawLineCount, p.DecodedSize, p.nextBoundOffset, p.lastBoundOffset, p.bound, p.Parts)

239}

240

// newPart parses a new part, which can be the top-level message.
//
// offset is the bound offset for parts, and the start of message for top-level
// messages. parent indicates if this is a top-level message (nil) or a sub-part
// (non-nil). For a sub-part, the line at offset must be a non-closing boundary
// of parent.
//
// The header is read and parsed, and the media type and the content-id,
// content-description and content-transfer-encoding fields are set from it. For
// a top-level message the envelope is parsed as well. For a multipart, the
// preamble is skipped, so the part is positioned at its first boundary.
//
// If strict is set, or in pedantic mode, a malformed content-type is an error.
// Otherwise an attempt is made to recover the media type.
//
// If an error occurs, p's exported values can still be relevant. EnsurePart uses these values.
// newPart panics if r is nil.
func newPart(log *mlog.Log, strict bool, r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {
	if r == nil {
		panic("nil reader")
	}
	p = Part{
		BoundaryOffset: -1,
		EndOffset:      -1,
		r:              r,
		parent:         parent,
		strict:         strict,
	}

	b := &bufAt{strict: strict, r: r, offset: offset}

	// A sub-part starts with the boundary line of its parent, which must not be the
	// closing boundary.
	if parent != nil {
		p.BoundaryOffset = offset
		if line, _, err := b.ReadLine(true); err != nil {
			return p, err
		} else if match, finish := checkBound(line, parent.bound); !match {
			return p, fmt.Errorf("missing bound")
		} else if finish {
			return p, fmt.Errorf("new part for closing boundary")
		}
	}

	// Collect header.
	// BodyOffset starts out equal to HeaderOffset, so an early error return leaves a
	// part with an empty header.
	p.HeaderOffset = b.offset
	p.BodyOffset = b.offset
	hb := &bytes.Buffer{}
	for {
		line, _, err := b.ReadLine(true)
		if err == io.EOF {
			// No body is valid.
			break
		}
		if err != nil {
			return p, fmt.Errorf("reading header line: %w", err)
		}
		hb.Write(line)
		if len(line) == 2 {
			break // crlf
		}
	}
	p.BodyOffset = b.offset

	// Don't attempt to parse empty header, mail.ReadMessage doesn't like it.
	if p.HeaderOffset == p.BodyOffset {
		p.header = textproto.MIMEHeader{}
	} else {
		h, err := parseHeader(hb)
		if err != nil {
			return p, fmt.Errorf("parsing header: %w", err)
		}
		p.header = h
	}

	// An absent/empty content-type is not treated as an error: both branches below
	// are skipped and MediaType/MediaSubType stay empty.
	ct := p.header.Get("Content-Type")
	mt, params, err := mime.ParseMediaType(ct)
	if err != nil && ct != "" {
		if moxvar.Pedantic || strict {
			return p, fmt.Errorf("%w: %s: %q", ErrBadContentType, err, ct)
		}

		// Try parsing just a content-type, ignoring parameters.
		// ../rfc/2045:628
		ct = strings.TrimSpace(strings.SplitN(ct, ";", 2)[0])
		t := strings.SplitN(ct, "/", 2)
		// isToken reports whether s is non-empty and has no control, non-ASCII or
		// separator characters.
		isToken := func(s string) bool {
			const separators = `()<>@,;:\\"/[]?= ` // ../rfc/2045:663
			for _, c := range s {
				if c < 0x20 || c >= 0x80 || strings.ContainsRune(separators, c) {
					return false
				}
			}
			return len(s) > 0
		}
		// We cannot recover content-type of multipart, we won't have a boundary.
		if len(t) == 2 && isToken(t[0]) && !strings.EqualFold(t[0], "multipart") && isToken(t[1]) {
			p.MediaType = strings.ToUpper(t[0])
			p.MediaSubType = strings.ToUpper(t[1])
		} else {
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		}
		log.Debugx("malformed content-type, attempting to recover and continuing", err, mlog.Field("contenttype", p.header.Get("Content-Type")), mlog.Field("mediatype", p.MediaType), mlog.Field("mediasubtype", p.MediaSubType))
	} else if mt != "" {
		t := strings.SplitN(strings.ToUpper(mt), "/", 2)
		if len(t) != 2 {
			if moxvar.Pedantic || strict {
				return p, fmt.Errorf("bad content-type: %q (content-type %q)", mt, ct)
			}
			log.Debug("malformed media-type, ignoring and continuing", mlog.Field("type", mt))
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		} else {
			// Parameters are only kept for a fully valid content-type.
			p.MediaType = t[0]
			p.MediaSubType = t[1]
			p.ContentTypeParams = params
		}
	}

	p.ContentID = p.header.Get("Content-Id")
	p.ContentDescription = p.header.Get("Content-Description")
	p.ContentTransferEncoding = strings.ToUpper(p.header.Get("Content-Transfer-Encoding"))

	// Only top-level messages get an envelope.
	if parent == nil {
		p.Envelope, err = parseEnvelope(log, mail.Header(p.header))
		if err != nil {
			return p, err
		}
	}

	if p.MediaType == "MULTIPART" {
		s := params["boundary"]
		if s == "" {
			return p, errMissingBoundaryParam
		}
		p.bound = append([]byte("--"), s...)

		// Discard preamble, before first boundary.
		for {
			// Peek, so the boundary line itself is not consumed and b.offset ends up at
			// the start of the boundary.
			line, _, err := b.PeekLine(true)
			if err != nil {
				return p, fmt.Errorf("parsing line for part preamble: %w", err)
			}
			// Line only needs boundary prefix, not exact match. ../rfc/2046:1103
			// Well, for compatibility, we require whitespace after the boundary. Because some
			// software use the same boundary but with text appended for sub parts.
			if match, finish := checkBound(line, p.bound); match {
				if finish {
					return p, errFirstBoundCloses
				}
				break
			}
			// Consume the preamble line that was just peeked.
			b.ReadLine(true)
		}
		p.nextBoundOffset = b.offset
		p.lastBoundOffset = b.offset
	}

	return p, nil
}

386

387// Header returns the parsed header of this part.

388func (p *Part) Header() (textproto.MIMEHeader, error) {

389 if p.header != nil {

390 return p.header, nil

391 }

392 if p.HeaderOffset == p.BodyOffset {

393 p.header = textproto.MIMEHeader{}

394 return p.header, nil

395 }

396 h, err := parseHeader(p.HeaderReader())

397 p.header = h

398 return h, err

399}

400

401// HeaderReader returns a reader for the header section of this part, including ending bare CRLF.

402func (p *Part) HeaderReader() io.Reader {

403 return io.NewSectionReader(p.r, p.HeaderOffset, p.BodyOffset-p.HeaderOffset)

404}

405

// parseHeader parses a message header read from r. Only call this on non-empty
// input (even though an empty header is valid).
//
// On a read or parse error a nil header is returned along with the error.
func parseHeader(r io.Reader) (textproto.MIMEHeader, error) {
	// Parsing is done with mail.ReadMessage rather than textproto.ReadMIMEHeaders:
	// the former handles email messages properly, the latter only works for HTTP
	// headers.

	// The \r\n separating header and body is optional in the input. It is added when
	// missing, because parsing with Go <1.21 results in an EOF error without it.
	// todo: directly parse from reader r when Go 1.20 is no longer supported.
	data, err := io.ReadAll(r)
	if err != nil {
		return nil, err
	}
	const crlf = "\r\n"
	if bytes.HasSuffix(data, []byte(crlf)) && !bytes.HasSuffix(data, []byte(crlf+crlf)) {
		data = append(data, crlf...)
	}

	msg, err := mail.ReadMessage(bytes.NewReader(data))
	if err != nil {
		return nil, err
	}
	return textproto.MIMEHeader(msg.Header), nil
}

429

430var wordDecoder = mime.WordDecoder{

431 CharsetReader: func(charset string, r io.Reader) (io.Reader, error) {

432 switch strings.ToLower(charset) {

433 case "", "us-ascii", "utf-8":

434 return r, nil

435 }

436 enc, _ := ianaindex.MIME.Encoding(charset)

437 if enc == nil {

438 enc, _ = ianaindex.IANA.Encoding(charset)

439 }

440 if enc == nil {

441 return r, fmt.Errorf("unknown charset %q", charset)

442 }

443 return enc.NewDecoder().Reader(r), nil

444 },

445}

446

447func parseEnvelope(log *mlog.Log, h mail.Header) (*Envelope, error) {

448 date, _ := h.Date()

449

450 // We currently marshal this field to JSON. But JSON cannot represent all

451 // time.Time. Time zone of 24:00 was seen in the wild. We won't try for extreme

452 // years, but we can readjust timezones.

453 // todo: remove this once we no longer store using json.

454 _, offset := date.Zone()

455 if date.Year() > 9999 {

456 date = time.Time{}

457 } else if offset <= -24*3600 || offset >= 24*3600 {

458 date = time.Unix(date.Unix(), 0).UTC()

459 }

460

461 subject := h.Get("Subject")

462 if s, err := wordDecoder.DecodeHeader(subject); err == nil {

463 subject = s

464 }

465

466 env := &Envelope{

467 date,

468 subject,

469 parseAddressList(log, h, "from"),

470 parseAddressList(log, h, "sender"),

471 parseAddressList(log, h, "reply-to"),

472 parseAddressList(log, h, "to"),

473 parseAddressList(log, h, "cc"),

474 parseAddressList(log, h, "bcc"),

475 h.Get("In-Reply-To"),

476 h.Get("Message-Id"),

477 }

478 return env, nil

479}

480

481func parseAddressList(log *mlog.Log, h mail.Header, k string) []Address {

482 // todo: possibly work around ios mail generating incorrect q-encoded "phrases" with unencoded double quotes? ../rfc/2047:382

483 l, err := h.AddressList(k)

484 if err != nil {

485 return nil

486 }

487 var r []Address

488 for _, a := range l {

489 // todo: parse more fully according to ../rfc/5322:959

490 var user, host string

491 addr, err := smtp.ParseAddress(a.Address)

492 if err != nil {

493 log.Infox("parsing address (continuing)", err, mlog.Field("address", a.Address))

494 } else {

495 user = addr.Localpart.String()

496 host = addr.Domain.ASCII

497 }

498 r = append(r, Address{a.Name, user, host})

499 }

500 return r

501}

502

503// ParseNextPart parses the next (sub)part of this multipart message.

504// ParseNextPart returns io.EOF and a nil part when there are no more parts.

505// Only used for initial parsing of message. Once parsed, use p.Parts.

506func (p *Part) ParseNextPart(log *mlog.Log) (*Part, error) {

507 if len(p.bound) == 0 {

508 return nil, errNotMultipart

509 }

510 if p.nextBoundOffset == -1 {

511 if enforceSequential {

512 panic("access not sequential")

513 }

514 // Set nextBoundOffset by fully reading the last part.

515 last, err := newPart(log, p.strict, p.r, p.lastBoundOffset, p)

516 if err != nil {

517 return nil, err

518 }

519 if _, err := io.Copy(io.Discard, last.RawReader()); err != nil {

520 return nil, err

521 }

522 if p.nextBoundOffset == -1 {

523 return nil, fmt.Errorf("internal error: reading part did not set nextBoundOffset")

524 }

525 }

526 b := &bufAt{strict: p.strict, r: p.r, offset: p.nextBoundOffset}

527 // todo: should we require a crlf on final closing bound? we don't require it because some message/rfc822 don't have a crlf after their closing boundary, so those messages don't end in crlf.

528 line, crlf, err := b.ReadLine(false)

529 if err != nil {

530 return nil, err

531 }

532 if match, finish := checkBound(line, p.bound); !match {

533 return nil, fmt.Errorf("expected bound, got %q", line)

534 } else if finish {

535 // Read any trailing data.

536 if p.parent != nil {

537 for {

538 line, _, err := b.PeekLine(false)

539 if err != nil {