package junk

// see https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering
// - todo: better html parsing?
// - todo: try reading text in pdf?
// - todo: try to detect language, have words per language? can be in the same dictionary. currently my dictionary is biased towards treating english as spam.

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"strings"
	"unicode"

	"golang.org/x/net/html"

	"go.etcd.io/bbolt"

	"github.com/mjl-/mox/message"
)

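// tokenizeMail opens the message file at path and parses it, returning whether
// the message could be read, the words found in it, and any error.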
func (f *Filter) tokenizeMail(path string) (bool, map[string]struct{}, error) {
	mf, err := os.Open(path)
	if err != nil {
		return false, nil, err
	}
	defer func() {
		err := mf.Close()
		f.log.Check(err, "closing message file")
	}()
	fi, err := mf.Stat()
	if err != nil {
		return false, nil, err
	}
	p, _ := message.EnsurePart(f.log.Logger, false, mf, fi.Size())
	words, err := f.ParseMessage(p)
	return true, words, err
}

// ParseMessage reads a mail and returns a map with words.
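//
// A minimal usage sketch, assuming an initialized Filter f and a message.Part p
// (e.g. from message.EnsurePart):
//
//	words, err := f.ParseMessage(p)
//	if err != nil {
//		// handle the error
//	}
//	// words can then be used for training or classifying.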
func (f *Filter) ParseMessage(p message.Part) (map[string]struct{}, error) {
	metaWords := map[string]struct{}{}
	textWords := map[string]struct{}{}
	htmlWords := map[string]struct{}{}

	hdrs, err := p.Header()
	if err != nil {
		return nil, fmt.Errorf("parsing headers: %v", err)
	}

	// Add words from the header, annotated with <field>+":".
	// todo: add whether header is dkim-verified?
	for k, l := range hdrs {
		for _, h := range l {
			switch k {
			case "From", "To", "Cc", "Bcc", "Reply-To", "Subject", "Sender", "Return-Path":
			// case "Subject", "To":
			default:
				continue
			}
			words := map[string]struct{}{}
			f.tokenizeText(strings.NewReader(h), words)
			for w := range words {
				if len(w) <= 3 {
					continue
				}
				s := k + ":" + w
				if len(s) > bbolt.MaxKeySize {
					continue
				}
				metaWords[s] = struct{}{}
			}
		}
	}

	if err := f.mailParse(p, metaWords, textWords, htmlWords); err != nil {
		return nil, fmt.Errorf("parsing message: %w", err)
	}

	for w := range metaWords {
		textWords[w] = struct{}{}
	}
	for w := range htmlWords {
		textWords[w] = struct{}{}
	}

	return textWords, nil
}

// mailParse walks the message parts, tokenizing the words in each text and
// html part it encounters.
func (f *Filter) mailParse(p message.Part, metaWords, textWords, htmlWords map[string]struct{}) error {
	ct := p.MediaType + "/" + p.MediaSubType

	if ct == "TEXT/HTML" {
		err := f.tokenizeHTML(p.ReaderUTF8OrBinary(), metaWords, htmlWords)
		// log.Printf("html parsed, words %v", htmlWords)
		return err
	}
	// Parts without a Content-Type are treated as text. Note that ct is never
	// the empty string, it always contains the "/" separator.
	if ct == "/" || strings.HasPrefix(ct, "TEXT/") {
		err := f.tokenizeText(p.ReaderUTF8OrBinary(), textWords)
		// log.Printf("text parsed, words %v", textWords)
		return err
	}
	if p.Message != nil {
		// Nested message, happens for forwarding.
		if err := p.SetMessageReaderAt(); err != nil {
			return fmt.Errorf("setting reader on nested message: %w", err)
		}
		return f.mailParse(*p.Message, metaWords, textWords, htmlWords)
	}
	for _, sp := range p.Parts {
		if err := f.mailParse(sp, metaWords, textWords, htmlWords); err != nil {
			return err
		}
	}
	return nil
}

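// looksRandom reports whether s looks like a machine-generated string rather
// than a natural-language word, so the tokenizer can skip it.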
func looksRandom(s string) bool {
	// Random strings, eg 2fvu9stm9yxhnlu. ASCII only, with long stretches of consonants.
	stretch := 0
	const consonants = "bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ23456789" // 0 and 1 may be used as o and l/i
	stretches := 0
	for _, c := range s {
		if c >= 0x80 {
			return false
		}
		if strings.ContainsRune(consonants, c) {
			stretch++
			continue
		}
		if stretch >= 6 {
			stretches++
		}
		stretch = 0
	}
	if stretch >= 6 {
		stretches++
	}
	return stretches > 0
}

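// looksNumeric reports whether s looks numeric, e.g. a timestamp or
// (hexadecimal) identifier with a long run of digits, which would not make a
// useful dictionary word.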
func looksNumeric(s string) bool {
	s = strings.TrimPrefix(s, "0x") // Hexadecimal.
	var digits, hex, other, digitstretch, maxdigitstretch int
	for _, c := range s {
		if c >= '0' && c <= '9' {
			digits++
			digitstretch++
			continue
		} else if c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F' {
			hex++
		} else {
			other++
		}
		if digitstretch > maxdigitstretch {
			maxdigitstretch = digitstretch
		}
		// Reset at each non-digit, so maxdigitstretch tracks the longest run.
		digitstretch = 0
	}
	if digitstretch > maxdigitstretch {
		maxdigitstretch = digitstretch
	}
	return maxdigitstretch >= 4 || other == 0 && maxdigitstretch >= 3
}

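// tokenizeText splits the text from r into lower-cased words, adding them to
// words as one-, two- and/or three-grams depending on the filter configuration.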
func (f *Filter) tokenizeText(r io.Reader, words map[string]struct{}) error {
	b := &strings.Builder{}
	var prev string
	var prev2 string

	wordAdd := func(s string) {
		if len(s) > bbolt.MaxKeySize {
			return
		}
		words[s] = struct{}{}
	}

	add := func() {
		defer b.Reset()
		if b.Len() <= 2 {
			return
		}

		s := b.String()
		s = strings.Trim(s, "'")
		var nondigit bool
		for _, c := range s {
			if !unicode.IsDigit(c) {
				nondigit = true
				break
			}
		}

		if !(nondigit && len(s) > 2) {
			return
		}

		if looksRandom(s) {
			return
		}
		if looksNumeric(s) {
			return
		}

		// todo: do something for URLs, parse them? keep their domain only?

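		// Depending on configuration, also combine the current word with the
		// previous one(s), so multi-word phrases become tokens too.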
		if f.Threegrams && prev2 != "" && prev != "" {
			wordAdd(prev2 + " " + prev + " " + s)
		}
		if f.Twograms && prev != "" {
			wordAdd(prev + " " + s)
		}
		if f.Onegrams {
			wordAdd(s)
		}
		prev2 = prev
		prev = s
	}

	br := bufio.NewReader(r)

	peekLetter := func() bool {
		c, _, err := br.ReadRune()
		if err == nil {
			err = br.UnreadRune()
		}
		return err == nil && unicode.IsLetter(c)
	}

	for {
		c, _, err := br.ReadRune()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		if !unicode.IsLetter(c) && !unicode.IsDigit(c) && (c != '\'' || b.Len() > 0 && peekLetter()) {
			add()
		} else {
			b.WriteRune(unicode.ToLower(c))
		}
	}
	add()
	return nil
}

// tokenizeHTML parses the html from r and tokenizes its text into words.
// The meta map is currently unused.
func (f *Filter) tokenizeHTML(r io.Reader, meta, words map[string]struct{}) error {
	htmlReader := &htmlTextReader{
		t:    html.NewTokenizer(r),
		meta: map[string]struct{}{},
	}
	return f.tokenizeText(htmlReader, words)
}

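// htmlTextReader is an io.Reader that yields the text content of HTML,
// skipping the contents of script, style and svg elements and including the
// alt text of img tags.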
type htmlTextReader struct {
	t        *html.Tokenizer
	meta     map[string]struct{}
	tagStack []string
	buf      []byte
	err      error
}

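// Read implements io.Reader by walking the HTML token stream, stashing any
// text that does not fit in buf for a next call.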
func (r *htmlTextReader) Read(buf []byte) (n int, err error) {
	// todo: deal with invalid html better. the tokenizer is just tokenizing, we need to fix up the nesting etc. eg, rules say some elements close certain open elements.
	// todo: deal with inline elements? they shouldn't cause a word break.

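	// give copies as much of nbuf into buf as fits and saves the remainder in
	// r.buf, to be returned on the next Read.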
	give := func(nbuf []byte) (int, error) {
		n := min(len(buf), len(nbuf))
		copy(buf, nbuf[:n])
		nbuf = nbuf[n:]
		if len(nbuf) < cap(r.buf) {
			r.buf = r.buf[:len(nbuf)]
		} else {
			r.buf = make([]byte, len(nbuf), 3*len(nbuf)/2)
		}
		copy(r.buf, nbuf)
		return n, nil
	}

	if len(r.buf) > 0 {
		return give(r.buf)
	}
	if r.err != nil {
		return 0, r.err
	}

	for {
		switch r.t.Next() {
		case html.ErrorToken:
			r.err = r.t.Err()
			return 0, r.err
		case html.TextToken:
			if len(r.tagStack) > 0 {
				switch r.tagStack[len(r.tagStack)-1] {
				case "script", "style", "svg":
					continue
				}
			}
			buf := r.t.Text()
			if len(buf) > 0 {
				return give(buf)
			}
		case html.StartTagToken:
			tagBuf, moreAttr := r.t.TagName()
			tag := string(tagBuf)
			//log.Printf("tag %q %v", tag, r.tagStack)

			if tag == "img" && moreAttr {
				var key, val []byte
				for moreAttr {
					key, val, moreAttr = r.t.TagAttr()
					if string(key) == "alt" && len(val) > 0 {
						return give(val)
					}
				}
			}

			// Empty elements, https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
			switch tag {
			case "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr":
				continue
			}

			r.tagStack = append(r.tagStack, tag)
		case html.EndTagToken:
			// log.Printf("tag pop %v", r.tagStack)
			if len(r.tagStack) > 0 {
				r.tagStack = r.tagStack[:len(r.tagStack)-1]
			}
		case html.SelfClosingTagToken:
		case html.CommentToken:
		case html.DoctypeToken:
		}
	}
}