1package smtpserver
2
3import (
4 "errors"
5 "fmt"
6 "time"
7
8 "github.com/mjl-/bstore"
9
10 "github.com/mjl-/mox/mlog"
11 "github.com/mjl-/mox/smtp"
12 "github.com/mjl-/mox/store"
13)
14
// reputationMethod identifies which step of the reputation check produced the
// result returned by reputation.
type reputationMethod string

const (
	// History found for the full "message from" address (localpart and domain).
	methodMsgfromFull reputationMethod = "msgfromfull"
	// A message was previously sent to the full "message from" address.
	methodMsgtoFull reputationMethod = "msgtofull"
	// History found for the exact domain of the "message from" address.
	methodMsgfromDomain reputationMethod = "msgfromdomain"
	// History found for the organizational domain of the "message from" address.
	methodMsgfromOrgDomain reputationMethod = "msgfromorgdomain"
	// A message was previously sent to an address in the "message from" domain.
	methodMsgtoDomain reputationMethod = "msgtodomain"
	// A message was previously sent to an address in the organizational domain of
	// the "message from" address.
	methodMsgtoOrgDomain reputationMethod = "msgtoorgdomain"
	// History found for a DKIM domain, the SPF mailfrom address or domain, or the
	// EHLO domain.
	methodDKIMSPF reputationMethod = "dkimspf"
	// History found for the remote IP as stored in RemoteIPMasked1,
	// RemoteIPMasked2 and RemoteIPMasked3 respectively. Later variants need more
	// historic messages before a result is conclusive.
	methodIP1 reputationMethod = "ip1"
	methodIP2 reputationMethod = "ip2"
	methodIP3 reputationMethod = "ip3"
	// No step found any matching history.
	methodNone reputationMethod = "none"
)
30
31// Reputation returns whether message m is likely junk.
32//
33// This function is called after checking for a manually configured spf mailfrom
34// allow (e.g. for mailing lists), and after checking for a dmarc reject policy.
35//
36// The decision is made based on historic messages delivered to the same
37// destination mailbox, MailboxOrigID. Because each mailbox may have a different
38// accept policy. We only use messages that have been marked as either junk or
39// non-junk. We help users by automatically marking them as non-junk when moving to
40// certain folders in the default config (e.g. the archive folder). We expect users
41// to mark junk messages as such when they read it. And to keep it in their inbox,
42// regular trash or archive if it is not.
43//
44// The basic idea is to keep accepting messages that were accepted in the past, and
45// keep rejecting those that were rejected. This is relatively easy to check if
46// mail passes SPF and/or DKIM with Message-From alignment. Regular email from
47// known people will be let in. But spammers are trickier. They will use new IPs,
48// (sub)domains, no or newly created SPF and/or DKIM identifiers, new localparts,
49// etc. This function likely ends up returning "inconclusive" for such emails. The
50// junkfilter will have to take care of a final decision.
51//
52// In case of doubt, it doesn't hurt much to accept another mail that a user has
53// communicated successfully with in the past. If the most recent message is marked
54// as junk that could have happened accidentally. If another message is let in, and
55// it is again junk, future messages will be rejected.
56//
57// Actual spammers will probably try to use identifiers, i.e. (sub)domain, dkim/spf
58// identifiers and ip addresses for which we have no history. We may only have
59// ip-based reputation, perhaps only an ip range, perhaps nothing.
60//
61// Some profiles of first-time senders:
62//
63// - Individuals. They can typically get past the junkfilter if needed.
64// - Transactional emails. They should get past the junkfilter. If they use one of
65// the larger email service providers, their reputation could help. If the
66// junkfilter rejects the message, users can recover the message from the Rejects
67// mailbox. The first message is typically initiated by a user, e.g. by registering.
68// - Desired commercial email will have to get past the junkfilter based on its
69// content. There will typically be earlier communication with the (organizational)
70// domain that would let the message through.
71// - Mailing list. May get past the junkfilter. If delivery is to a separate
72// mailbox, the junkfilter will let it in because of little history. Long enough to
73// build reputation based on DKIM/SPF signals. Users are best off to
74// configure accept rules for messages from mailing lists.
75//
76// The decision-making process looks at historic messages. The following properties
77// are checked until matching messages are found. If they are found, a decision is
78// returned, which may be inconclusive. The next property on the list is only
79// checked if a step did not match any messages.
80//
81// - Messages matching full "message from" address, either with strict/relaxed
82// dkim/spf-verification, or without.
83// - Messages the user sent to the "message from" address.
84// - Messages matching only the domain of the "message from" address (different
85// localpart), again with verification or without.
86// - Messages sent to an address in the domain of the "message from" address.
87// - The previous two checks again, but now checking against the organizational
88// domain instead of the exact domain.
89// - Matching DKIM domains and a matching SPF mailfrom, or mailfrom domain, or ehlo
90// domain.
91// - "Exact" IP, or nearby IPs.
92//
93// References:
94// ../rfc/5863
95// ../rfc/7960
96// ../rfc/6376:1915
97// ../rfc/6376:3716
98// ../rfc/7208:2167
99func reputation(tx *bstore.Tx, log *mlog.Log, m *store.Message) (rjunk *bool, rconclusive bool, rmethod reputationMethod, rerr error) {
100 boolptr := func(v bool) *bool {
101 return &v
102 }
103 xfalse := boolptr(false)
104 xtrue := boolptr(true)
105
106 type queryError string
107
108 defer func() {
109 x := recover()
110 if x == nil {
111 return
112 }
113 if xerr, ok := x.(queryError); ok {
114 rerr = errors.New(string(xerr))
115 return
116 }
117 panic(x)
118 }()
119
120 now := time.Now()
121
122 // messageQuery returns a base query for historic seen messages to the same
123 // mailbox, at most maxAge old, and at most maxCount messages.
124 messageQuery := func(fm *store.Message, maxAge time.Duration, maxCount int) *bstore.Query[store.Message] {
125 q := bstore.QueryTx[store.Message](tx)
126 q.FilterEqual("MailboxOrigID", m.MailboxID)
127 q.FilterEqual("Expunged", false)
128 q.FilterFn(func(m store.Message) bool {
129 return m.Junk || m.Notjunk
130 })
131 if fm != nil {
132 q.FilterNonzero(*fm)
133 }
134 q.FilterGreaterEqual("Received", now.Add(-maxAge))
135 q.Limit(maxCount)
136 q.SortDesc("Received")
137 return q
138 }
139
140 // Execute the query, returning messages or returning error through panic.
141 xmessageList := func(q *bstore.Query[store.Message], descr string) []store.Message {
142 t0 := time.Now()
143 l, err := q.List()
144 log.Debugx("querying messages for reputation", err, mlog.Field("msgs", len(l)), mlog.Field("descr", descr), mlog.Field("queryduration", time.Since(t0)))
145 if err != nil {
146 panic(queryError(fmt.Sprintf("listing messages: %v", err)))
147 }
148 return l
149 }
150
151 xrecipientExists := func(q *bstore.Query[store.Recipient]) bool {
152 exists, err := q.Exists()
153 if err != nil {
154 panic(queryError(fmt.Sprintf("checking for recipient: %v", err)))
155 }
156 return exists
157 }
158
159 const year = 365 * 24 * time.Hour
160
161 // Look for historic messages with same "message from" address. We'll
162 // treat any validation (strict/dmarc/relaxed) the same, but "none"
163 // separately.
164 //
165 // We only need 1 message, and sometimes look at a second message. If
166 // the last message or the message before was an accept, we accept. If
167 // the single last or last two were a reject, we reject.
168 //
169 // If there was no validation, any signal is inconclusive.
170 if m.MsgFromDomain != "" {
171 q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain}, 3*year, 2)
172 q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
173 msgs := xmessageList(q, "mgsfromfull")
174 if len(msgs) > 0 {
175 // todo: we may want to look at dkim/spf in this case.
176 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
177 conclusive := m.MsgFromValidated
178 return &spam, conclusive, methodMsgfromFull, nil
179 }
180 if !m.MsgFromValidated {
181 // Look for historic messages that were validated. If present, this is likely spam.
182 // Only return as conclusively spam if history also says this From-address sent
183 // spam.
184 q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain, MsgFromValidated: true}, 3*year, 2)
185 msgs = xmessageList(q, "msgfromfull-validated")
186 if len(msgs) > 0 {
187 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
188 return xtrue, spam, methodMsgfromFull, nil
189 }
190 }
191
192 // Look if we ever sent to this address. If so, we accept,
193 qr := bstore.QueryTx[store.Recipient](tx)
194 qr.FilterEqual("Localpart", m.MsgFromLocalpart)
195 qr.FilterEqual("Domain", m.MsgFromDomain)
196 qr.FilterGreaterEqual("Sent", now.Add(-3*year))
197 if xrecipientExists(qr) {
198 return xfalse, true, methodMsgtoFull, nil
199 }
200
201 // Look for domain match, then for organizational domain match.
202 for _, orgdomain := range []bool{false, true} {
203 qm := store.Message{}
204 var method reputationMethod
205 var descr string
206 if orgdomain {
207 qm.MsgFromOrgDomain = m.MsgFromOrgDomain
208 method = methodMsgfromOrgDomain
209 descr = "msgfromorgdomain"
210 } else {
211 qm.MsgFromDomain = m.MsgFromDomain
212 method = methodMsgfromDomain
213 descr = "msgfromdomain"
214 }
215
216 q := messageQuery(&qm, 2*year, 20)
217 q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
218 msgs := xmessageList(q, descr)
219 if len(msgs) > 0 {
220 nonjunk := 0
221 for _, m := range msgs {
222 if !m.Junk {
223 nonjunk++
224 }
225 }
226 if 100*nonjunk/len(msgs) > 80 {
227 return xfalse, true, method, nil
228 }
229 if nonjunk == 0 {
230 // Only conclusive with at least 3 different localparts.
231 localparts := map[smtp.Localpart]struct{}{}
232 for _, m := range msgs {
233 localparts[m.MsgFromLocalpart] = struct{}{}
234 if len(localparts) == 3 {
235 return xtrue, true, method, nil
236 }
237 }
238 return xtrue, false, method, nil
239 }
240 // Mixed signals from domain. We don't want to block a new sender.
241 return nil, false, method, nil
242 }
243 if !m.MsgFromValidated {
244 // Look for historic messages that were validated. If present, this is likely spam.
245 // Only return as conclusively spam if history also says this From-address sent
246 // spam.
247 q := messageQuery(&qm, 2*year, 2)
248 q.FilterEqual("MsgFromValidated", true)
249 msgs = xmessageList(q, descr+"-validated")
250 if len(msgs) > 0 {
251 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
252 return xtrue, spam, method, nil
253 }
254 }
255
256 // Look if we ever sent to this address. If so, we accept,
257 qr := bstore.QueryTx[store.Recipient](tx)
258 if orgdomain {
259 qr.FilterEqual("OrgDomain", m.MsgFromOrgDomain)
260 method = methodMsgtoOrgDomain
261 } else {
262 qr.FilterEqual("Domain", m.MsgFromDomain)
263 method = methodMsgtoDomain
264 }
265 qr.FilterGreaterEqual("Sent", now.Add(-2*year))
266 if xrecipientExists(qr) {
267 return xfalse, true, method, nil
268 }
269 }
270 }
271
272 // DKIM and SPF.
273 // We only use identities that passed validation. Failed identities are ignored. ../rfc/6376:2447
274 // todo future: we could do something with the DKIM identity (i=) field if it is more specific than just the domain (d=).
275 dkimspfsignals := []float64{}
276 dkimspfmsgs := 0
277 for _, dom := range m.DKIMDomains {
278 q := messageQuery(nil, year/2, 50)
279 q.FilterIn("DKIMDomains", dom)
280 msgs := xmessageList(q, "dkimdomain")
281 if len(msgs) > 0 {
282 nspam := 0
283 for _, m := range msgs {
284 if m.Junk {
285 nspam++
286 }
287 }
288 pspam := float64(nspam) / float64(len(msgs))
289 dkimspfsignals = append(dkimspfsignals, pspam)
290 dkimspfmsgs = len(msgs)
291 }
292 }
293 if m.MailFromValidated || m.EHLOValidated {
294 var msgs []store.Message
295 if m.MailFromValidated && m.MailFromDomain != "" {
296 q := messageQuery(&store.Message{MailFromLocalpart: m.MailFromLocalpart, MailFromDomain: m.MailFromDomain}, year/2, 50)
297 msgs = xmessageList(q, "mailfrom")
298 if len(msgs) == 0 {
299 q := messageQuery(&store.Message{MailFromDomain: m.MailFromDomain}, year/2, 50)
300 msgs = xmessageList(q, "mailfromdomain")
301 }
302 }
303 if len(msgs) == 0 && m.EHLOValidated && m.EHLODomain != "" {
304 q := messageQuery(&store.Message{EHLODomain: m.EHLODomain}, year/2, 50)
305 msgs = xmessageList(q, "ehlodomain")
306 }
307 if len(msgs) > 0 {
308 nspam := 0
309 for _, m := range msgs {
310 if m.Junk {
311 nspam++
312 }
313 }
314 pspam := float64(nspam) / float64(len(msgs))
315 dkimspfsignals = append(dkimspfsignals, pspam)
316 if len(msgs) > dkimspfmsgs {
317 dkimspfmsgs = len(msgs)
318 }
319 }
320 }
321 if len(dkimspfsignals) > 0 {
322 var nham, nspam int
323 for _, p := range dkimspfsignals {
324 if p < .1 {
325 nham++
326 } else if p > .9 {
327 nspam++
328 }
329 }
330 if nham > 0 && nspam == 0 {
331 return xfalse, true, methodDKIMSPF, nil
332 }
333 if nspam > 0 && nham == 0 {
334 return xtrue, dkimspfmsgs > 1, methodDKIMSPF, nil
335 }
336 return nil, false, methodDKIMSPF, nil
337 }
338
339 // IP-based. A wider mask needs more messages to be conclusive.
340 // We require the resulting signal to be strong, i.e. likely ham or likely spam.
341 var msgs []store.Message
342 var need int
343 var method reputationMethod
344 if m.RemoteIPMasked1 != "" {
345 q := messageQuery(&store.Message{RemoteIPMasked1: m.RemoteIPMasked1}, year/4, 50)
346 msgs = xmessageList(q, "ip1")
347 need = 2
348 method = methodIP1
349 }
350 if len(msgs) == 0 && m.RemoteIPMasked2 != "" {
351 q := messageQuery(&store.Message{RemoteIPMasked2: m.RemoteIPMasked2}, year/4, 50)
352 msgs = xmessageList(q, "ip2")
353 need = 5
354 method = methodIP2
355 }
356 if len(msgs) == 0 && m.RemoteIPMasked3 != "" {
357 q := messageQuery(&store.Message{RemoteIPMasked3: m.RemoteIPMasked3}, year/4, 50)
358 msgs = xmessageList(q, "ip3")
359 need = 10
360 method = methodIP3
361 }
362 if len(msgs) > 0 {
363 nspam := 0
364 for _, m := range msgs {
365 if m.Junk {
366 nspam++
367 }
368 }
369 pspam := float64(nspam) / float64(len(msgs))
370 var spam *bool
371 if pspam < .25 {
372 spam = xfalse
373 } else if pspam > .75 {
374 spam = xtrue
375 }
376 conclusive := len(msgs) >= need && (pspam <= 0.1 || pspam >= 0.9)
377 return spam, conclusive, method, nil
378 }
379
380 return nil, false, methodNone, nil
381}
382