1// Package dane verifies TLS certificates through DNSSEC-verified TLSA records.
2//
3// On the internet, TLS certificates are commonly verified by checking if they are
4// signed by one of many commonly trusted Certificate Authorities (CAs). This is
5// PKIX or WebPKI. With DANE, TLS certificates are verified through
6// DNSSEC-protected DNS records of type TLSA. These TLSA records specify the rules
7// for verification ("usage") and whether a full certificate ("selector" cert) is
// checked or only its "subject public key info" ("selector" spki). The
// certificate or "spki", or a hash of it, is included in the TLSA record
// ("matchtype").
10//
11// DANE SMTP connections have two allowed "usages" (verification rules):
12// - DANE-EE, which only checks if the certificate or spki match, without the
13// WebPKI verification of expiration, name or signed-by-trusted-party verification.
14// - DANE-TA, which does verification similar to PKIX/WebPKI, but verifies against
15// a certificate authority ("trust anchor", or "TA") specified in the TLSA record
16// instead of the CA pool.
17//
18// DANE has two more "usages", that may be used with protocols other than SMTP:
19// - PKIX-EE, which matches the certificate or spki, and also verifies the
20// certificate against the CA pool.
21// - PKIX-TA, which verifies the certificate or spki against a "trust anchor"
22// specified in the TLSA record, that also has to be trusted by the CA pool.
23//
24// TLSA records are looked up for a specific port number, protocol (tcp/udp) and
25// host name. Each port can have different TLSA records. TLSA records must be
26// signed and verified with DNSSEC before they can be trusted and used.
27//
28// TLSA records are looked up under "TLSA candidate base domains". The domain
29// where the TLSA records are found is the "TLSA base domain". If the host to
30// connect to is a CNAME that can be followed with DNSSEC protection, it is the
31// first TLSA candidate base domain. If no protected records are found, the
32// original host name is the second TLSA candidate base domain.
33//
34// For TLS connections, the TLSA base domain is used with SNI during the
35// handshake.
36//
37// For TLS certificate verification that requires PKIX/WebPKI/trusted-anchor
38// verification (all except DANE-EE), the potential second TLSA candidate base
39// domain name is also valid. With SMTP, additionally for hosts found in MX records
40// for a "next-hop domain", the "original next-hop domain" (domain of an email
41// address to deliver to) is also a valid name, as is the "CNAME-expanded original
42// next-hop domain", bringing the potential total allowed names to four (if CNAMEs
43// are followed for the MX hosts).
44package dane
45
46// todo: why is https://datatracker.ietf.org/doc/html/draft-barnes-dane-uks-00 not in use? sounds reasonable.
47// todo: add a DialSRV function that accepts a domain name, looks up srv records, dials the service, verifies dane certificate and returns the connection. for ../rfc/7673
48
49import (
50 "bytes"
51 "context"
52 "crypto/sha256"
53 "crypto/sha512"
54 "crypto/tls"
55 "crypto/x509"
56 "errors"
57 "fmt"
58 "net"
59 "strings"
60 "time"
61
62 "github.com/prometheus/client_golang/prometheus"
63 "github.com/prometheus/client_golang/prometheus/promauto"
64
65 "github.com/mjl-/adns"
66
67 "github.com/mjl-/mox/dns"
68 "github.com/mjl-/mox/mlog"
69 "github.com/mjl-/mox/mox-"
70)
71
72var (
73 metricVerify = promauto.NewCounter(
74 prometheus.CounterOpts{
75 Name: "mox_dane_verify_total",
76 Help: "Total number of DANE verification attempts, including mox_dane_verify_errors_total.",
77 },
78 )
79 metricVerifyErrors = promauto.NewCounter(
80 prometheus.CounterOpts{
81 Name: "mox_dane_verify_errors_total",
82 Help: "Total number of DANE verification failures, causing connections to fail.",
83 },
84 )
85)
86
// Sentinel errors returned by this package, possibly wrapped. Compare with
// errors.Is.
var (
	// ErrNoRecords is returned when no TLSA records exist for the host, i.e. the
	// host has not opted into DANE.
	ErrNoRecords = errors.New("dane: no tlsa records")

	// ErrInsecure is returned when a DNS response needed for DANE (for the host,
	// its CNAME records, or its TLSA records) was not DNSSEC-protected.
	ErrInsecure = errors.New("dane: dns lookups insecure")

	// ErrNoMatch is returned when TLSA records were found, but the remote TLS
	// certificate could not be verified against any of them.
	ErrNoMatch = errors.New("dane: no match between certificate and tlsa records")
)
99
100// VerifyError is an error encountered while verifying a DANE TLSA record. For
101// example, an error encountered with x509 certificate trusted-anchor verification.
102// A TLSA record that does not match a TLS certificate is not a VerifyError.
103type VerifyError struct {
104 Err error // Underlying error, possibly from crypto/x509.
105 Record adns.TLSA // Cause of error.
106}
107
108// Error returns a string explaining this is a dane verify error along with the
109// underlying error.
110func (e VerifyError) Error() string {
111 return fmt.Sprintf("dane verify error: %s", e.Err)
112}
113
114// Unwrap returns the underlying error.
115func (e VerifyError) Unwrap() error {
116 return e.Err
117}
118
119// Dial looks up a DNSSEC-protected DANE TLSA record for the domain name and
120// port/service in address, checks for allowed usages, makes a network connection
121// and verifies the remote certificate against the TLSA records. If
122// verification succeeds, the verified record is returned.
123//
124// Different protocols require different usages. For example, SMTP with STARTTLS
125// for delivery only allows usages DANE-TA and DANE-EE. If allowedUsages is
126// non-nil, only the specified usages are taken into account when verifying, and
127// any others ignored.
128//
129// Errors that can be returned, possibly in wrapped form:
130// - ErrNoRecords, also in case the DNS response indicates "not found".
131// - adns.DNSError, potentially wrapping adns.ExtendedError of which some can
132// indicate DNSSEC errors.
133// - ErrInsecure
134// - VerifyError, potentially wrapping errors from crypto/x509.
135func Dial(ctx context.Context, resolver dns.Resolver, network, address string, allowedUsages []adns.TLSAUsage) (net.Conn, adns.TLSA, error) {
136 log := mlog.New("dane").WithContext(ctx)
137
138 // Split host and port.
139 host, portstr, err := net.SplitHostPort(address)
140 if err != nil {
141 return nil, adns.TLSA{}, fmt.Errorf("parsing address: %w", err)
142 }
143 port, err := resolver.LookupPort(ctx, network, portstr)
144 if err != nil {
145 return nil, adns.TLSA{}, fmt.Errorf("parsing port: %w", err)
146 }
147
148 hostDom, err := dns.ParseDomain(strings.TrimSuffix(host, "."))
149 if err != nil {
150 return nil, adns.TLSA{}, fmt.Errorf("parsing host: %w", err)
151 }
152
153 // ../rfc/7671:1015
154 // First follow CNAMEs for host. If the path to the final name is secure, we must
155 // lookup TLSA there first, then fallback to the original name. If the final name
156 // is secure that's also the SNI server name we must use, with the original name as
157 // allowed host during certificate name checks (for all TLSA usages other than
158 // DANE-EE).
159 cnameDom := hostDom
160 cnameAuthentic := true
161 for i := 0; ; i += 1 {
162 if i == 10 {
163 return nil, adns.TLSA{}, fmt.Errorf("too many cname lookups")
164 }
165 cname, cnameResult, err := resolver.LookupCNAME(ctx, cnameDom.ASCII+".")
166 cnameAuthentic = cnameAuthentic && cnameResult.Authentic
167 if !cnameResult.Authentic && i == 0 {
168 return nil, adns.TLSA{}, fmt.Errorf("%w: cname lookup insecure", ErrInsecure)
169 } else if dns.IsNotFound(err) {
170 break
171 } else if err != nil {
172 return nil, adns.TLSA{}, fmt.Errorf("resolving cname %s: %w", cnameDom, err)
173 } else if d, err := dns.ParseDomain(strings.TrimSuffix(cname, ".")); err != nil {
174 return nil, adns.TLSA{}, fmt.Errorf("parsing cname: %w", err)
175 } else {
176 cnameDom = d
177 }
178 }
179
180 // We lookup the IP.
181 ipnetwork := "ip"
182 if strings.HasSuffix(network, "4") {
183 ipnetwork += "4"
184 } else if strings.HasSuffix(network, "6") {
185 ipnetwork += "6"
186 }
187 ips, _, err := resolver.LookupIP(ctx, ipnetwork, cnameDom.ASCII+".")
188 // note: For SMTP with opportunistic DANE we would stop here with an insecure
189 // response. But as long as long as we have a verified original tlsa base name, we
190 // can continue with regular DANE.
191 if err != nil {
192 return nil, adns.TLSA{}, fmt.Errorf("resolving ips: %w", err)
193 } else if len(ips) == 0 {
194 return nil, adns.TLSA{}, &adns.DNSError{Err: "no ips for host", Name: cnameDom.ASCII, IsNotFound: true}
195 }
196
197 // Lookup TLSA records. If resolving CNAME was secure, we try that first. Otherwise
198 // we try at the secure original domain.
199 baseDom := hostDom
200 if cnameAuthentic {
201 baseDom = cnameDom
202 }
203 var records []adns.TLSA
204 var result adns.Result
205 for {
206 var err error
207 records, result, err = resolver.LookupTLSA(ctx, port, network, baseDom.ASCII+".")
208 // If no (secure) records can be found at the final cname, and there is an original
209 // name, try at original name.
210 // ../rfc/7671:1015
211 if baseDom != hostDom && (dns.IsNotFound(err) || !result.Authentic) {
212 baseDom = hostDom
213 continue
214 }
215 if !result.Authentic {
216 return nil, adns.TLSA{}, ErrInsecure
217 } else if dns.IsNotFound(err) {
218 return nil, adns.TLSA{}, ErrNoRecords
219 } else if err != nil {
220 return nil, adns.TLSA{}, fmt.Errorf("lookup dane tlsa records: %w", err)
221 }
222 break
223 }
224
225 // Keep only the allowed usages.
226 if allowedUsages != nil {
227 o := 0
228 for _, r := range records {
229 for _, usage := range allowedUsages {
230 if r.Usage == usage {
231 records[o] = r
232 o++
233 break
234 }
235 }
236 }
237 records = records[:o]
238 if len(records) == 0 {
239 // No point in dialing when we know we won't be able to verify the remote TLS
240 // certificate.
241 return nil, adns.TLSA{}, fmt.Errorf("no usable tlsa records remaining: %w", ErrNoMatch)
242 }
243 }
244
245 // We use the base domain for SNI, allowing the original domain as well.
246 // ../rfc/7671:1021
247 var moreAllowedHosts []dns.Domain
248 if baseDom != hostDom {
249 moreAllowedHosts = []dns.Domain{hostDom}
250 }
251
252 // Dial the remote host.
253 timeout := 30 * time.Second
254 if deadline, ok := ctx.Deadline(); ok && len(ips) > 0 {
255 timeout = time.Until(deadline) / time.Duration(len(ips))
256 }
257 dialer := &net.Dialer{Timeout: timeout}
258 var conn net.Conn
259 var dialErrs []error
260 for _, ip := range ips {
261 addr := net.JoinHostPort(ip.String(), portstr)
262 c, err := dialer.DialContext(ctx, network, addr)
263 if err != nil {
264 dialErrs = append(dialErrs, err)
265 continue
266 }
267 conn = c
268 break
269 }
270 if conn == nil {
271 return nil, adns.TLSA{}, errors.Join(dialErrs...)
272 }
273
274 var verifiedRecord adns.TLSA
275 config := TLSClientConfig(log, records, baseDom, moreAllowedHosts, &verifiedRecord)
276 tlsConn := tls.Client(conn, &config)
277 if err := tlsConn.HandshakeContext(ctx); err != nil {
278 conn.Close()
279 return nil, adns.TLSA{}, err
280 }
281 return tlsConn, verifiedRecord, nil
282}
283
284// TLSClientConfig returns a tls.Config to be used for dialing/handshaking a
285// TLS connection with DANE verification.
286//
287// Callers should only pass records that are allowed for the use of DANE. DANE
288// with SMTP only allows DANE-EE and DANE-TA usages, not the PKIX-usages.
289//
290// The config has InsecureSkipVerify set to true, with a custom VerifyConnection
291// function for verifying DANE. Its VerifyConnection can return ErrNoMatch and
292// additionally one or more (wrapped) errors of type VerifyError.
293//
294// The TLS config uses allowedHost for SNI.
295//
296// If verifiedRecord is not nil, it is set to the record that was successfully
297// verified, if any.
298func TLSClientConfig(log *mlog.Log, records []adns.TLSA, allowedHost dns.Domain, moreAllowedHosts []dns.Domain, verifiedRecord *adns.TLSA) tls.Config {
299 return tls.Config{
300 ServerName: allowedHost.ASCII, // For SNI.
301 InsecureSkipVerify: true,
302 VerifyConnection: func(cs tls.ConnectionState) error {
303 verified, record, err := Verify(log, records, cs, allowedHost, moreAllowedHosts)
304 log.Debugx("dane verification", err, mlog.Field("verified", verified), mlog.Field("record", record))
305 if verified {
306 if verifiedRecord != nil {
307 *verifiedRecord = record
308 }
309 return nil
310 } else if err == nil {
311 return ErrNoMatch
312 }
313 return fmt.Errorf("%w, and error(s) encountered during verification: %w", ErrNoMatch, err)
314 },
315 MinVersion: tls.VersionTLS12, // ../rfc/8996:31 ../rfc/8997:66
316 }
317}
318
319// Verify checks if the TLS connection state can be verified against DANE TLSA
320// records.
321//
322// allowedHost along with the optional moreAllowedHosts are the host names that are
323// allowed during certificate verification (as used by PKIX-TA, PKIX-EE, DANE-TA,
324// but not DANE-EE). A typical connection would allow just one name, but some uses
325// of DANE allow multiple, like SMTP which allow up to four valid names for a TLS
326// certificate based on MX/CNAME/TLSA/DNSSEC lookup results.
327//
328// When one of the records matches, Verify returns true, along with the matching
329// record and a nil error.
330// If there is no match, then in the typical case false, a zero record value and a
331// nil error is returned.
332// If an error is encountered while verifying a record, e.g. for x509
333// trusted-anchor verification, an error may be returned, typically one or more
334// (wrapped) errors of type VerifyError.
335func Verify(log *mlog.Log, records []adns.TLSA, cs tls.ConnectionState, allowedHost dns.Domain, moreAllowedHosts []dns.Domain) (verified bool, matching adns.TLSA, rerr error) {
336 metricVerify.Inc()
337 if len(records) == 0 {
338 metricVerifyErrors.Inc()
339 return false, adns.TLSA{}, fmt.Errorf("verify requires at least one tlsa record")
340 }
341 var errs []error
342 for _, r := range records {
343 ok, err := verifySingle(log, r, cs, allowedHost, moreAllowedHosts)
344 if err != nil {
345 errs = append(errs, VerifyError{err, r})
346 } else if ok {
347 return true, r, nil
348 }
349 }
350 metricVerifyErrors.Inc()
351 return false, adns.TLSA{}, errors.Join(errs...)
352}
353
354// verifySingle verifies the TLS connection against a single DANE TLSA record.
355//
356// If the remote TLS certificate matches with the TLSA record, true is
357// returned. Errors may be encountered while verifying, e.g. when checking one
358// of the allowed hosts against a TLSA record. A typical non-matching/verified
359// TLSA record returns a nil error. But in some cases, e.g. when encountering
360// errors while verifying certificates against a trust-anchor, an error can be
361// returned with one or more underlying x509 verification errors. A nil-nil error
362// is only returned when verified is false.
363func verifySingle(log *mlog.Log, tlsa adns.TLSA, cs tls.ConnectionState, allowedHost dns.Domain, moreAllowedHosts []dns.Domain) (verified bool, rerr error) {
364 if len(cs.PeerCertificates) == 0 {
365 return false, fmt.Errorf("no server certificate")
366 }
367
368 match := func(cert *x509.Certificate) bool {
369 var buf []byte
370 switch tlsa.Selector {
371 case adns.TLSASelectorCert:
372 buf = cert.Raw
373 case adns.TLSASelectorSPKI:
374 buf = cert.RawSubjectPublicKeyInfo
375 default:
376 return false
377 }
378
379 switch tlsa.MatchType {
380 case adns.TLSAMatchTypeFull:
381 case adns.TLSAMatchTypeSHA256:
382 d := sha256.Sum256(buf)
383 buf = d[:]
384 case adns.TLSAMatchTypeSHA512:
385 d := sha512.Sum512(buf)
386 buf = d[:]
387 default:
388 return false
389 }
390
391 return bytes.Equal(buf, tlsa.CertAssoc)
392 }
393
394 pkixVerify := func(host dns.Domain) ([][]*x509.Certificate, error) {
395 // Default Verify checks for expiration. We pass the host name to check. And we
396 // configure the intermediates. The roots are filled in by the x509 package.
397 opts := x509.VerifyOptions{
398 DNSName: host.ASCII,
399 Intermediates: x509.NewCertPool(),
400 Roots: mox.Conf.Static.TLS.CertPool,
401 }
402 for _, cert := range cs.PeerCertificates[1:] {
403 opts.Intermediates.AddCert(cert)
404 }
405 chains, err := cs.PeerCertificates[0].Verify(opts)
406 return chains, err
407 }
408
409 switch tlsa.Usage {
410 case adns.TLSAUsagePKIXTA:
411 // We cannot get at the system trusted ca certificates to look for the trusted
412 // anchor. So we just ask Go to verify, then see if any of the chains include the
413 // ca certificate.
414 var errs []error
415 for _, host := range append([]dns.Domain{allowedHost}, moreAllowedHosts...) {
416 chains, err := pkixVerify(host)
417 log.Debugx("pkix-ta verify", err)
418 if err != nil {
419 errs = append(errs, err)
420 continue
421 }
422 // The chains by x509's Verify should include the longest possible match, so it is
423 // sure to include the trusted anchor. ../rfc/7671:835
424 for _, chain := range chains {
425 // If pkix verified, check if any of the certificates match.
426 for i := len(chain) - 1; i >= 0; i-- {
427 if match(chain[i]) {
428 return true, nil
429 }
430 }
431 }
432 }
433 return false, errors.Join(errs...)
434
435 case adns.TLSAUsagePKIXEE:
436 // Check for a certificate match.
437 if !match(cs.PeerCertificates[0]) {
438 return false, nil
439 }
440 // And do regular pkix checks, ../rfc/7671:799
441 var errs []error
442 for _, host := range append([]dns.Domain{allowedHost}, moreAllowedHosts...) {
443 _, err := pkixVerify(host)
444 log.Debugx("pkix-ee verify", err)
445 if err == nil {
446 return true, nil
447 }
448 errs = append(errs, err)
449 }
450 return false, errors.Join(errs...)
451
452 case adns.TLSAUsageDANETA:
453 // We set roots, so the system defaults don't get used. Verify checks the host name
454 // (set below) and checks for expiration.
455 opts := x509.VerifyOptions{
456 Roots: x509.NewCertPool(),
457 }
458
459 // If the full certificate was included, we must add it to the valid roots, the TLS
460 // server may not send it. ../rfc/7671:692
461 var found bool
462 if tlsa.Selector == adns.TLSASelectorCert && tlsa.MatchType == adns.TLSAMatchTypeFull {
463 cert, err := x509.ParseCertificate(tlsa.CertAssoc)
464 if err != nil {
465 log.Debugx("parsing full exact certificate from tlsa record to use as root for usage dane-trusted-anchor", err)
466 // Continue anyway, perhaps the servers sends it again in a way that the tls package can parse? (unlikely)
467 } else {
468 opts.Roots.AddCert(cert)
469 found = true
470 }
471 }
472
473 for _, cert := range cs.PeerCertificates {
474 if match(cert) {
475 opts.Roots.AddCert(cert)
476 found = true
477 break
478 }
479 }
480 if !found {
481 // Trusted anchor was not found in TLS certificates so we won't be able to
482 // verify.
483 return false, nil
484 }
485
486 // Trusted anchor was found, still need to verify.
487 var errs []error
488 for _, host := range append([]dns.Domain{allowedHost}, moreAllowedHosts...) {
489 opts.DNSName = host.ASCII
490 _, err := cs.PeerCertificates[0].Verify(opts)
491 if err == nil {
492 return true, nil
493 }
494 errs = append(errs, err)
495 }
496 return false, errors.Join(errs...)
497
498 case adns.TLSAUsageDANEEE:
499 // ../rfc/7250 is about raw public keys instead of x.509 certificates in tls
500 // handshakes. Go's crypto/tls does not implement the extension (see
501 // crypto/tls/common.go, the extensions values don't appear in the
502 // rfc, but have values 19 and 20 according to
503 // https://www.iana.org/assignments/tls-extensiontype-values/tls-extensiontype-values.xhtml#tls-extensiontype-values-1
504 // ../rfc/7671:1148 mentions the raw public keys are allowed. It's still
505 // questionable that this is commonly implemented. For now the world can probably
506 // live with an ignored certificate wrapped around the subject public key info.
507
508 // We don't verify host name in certificate, ../rfc/7671:489
509 // And we don't check for expiration. ../rfc/7671:527
510 // The whole point of this type is to have simple secure infrastructure that
511 // doesn't automatically expire (at the most inconvenient times).
512 return match(cs.PeerCertificates[0]), nil
513
514 default:
515 // Unknown, perhaps defined in the future. Not an error.
516 log.Debug("unrecognized tlsa usage, skipping", mlog.Field("tlsausage", tlsa.Usage))
517 return false, nil
518 }
519}
520