macsync/@packages/imail/Sources/IMailSync/MIMEParser.swift

383 lines
16 KiB
Swift

import Foundation
// MARK: - Output Types
/// Parsed representation of a MIME message, matching the shape produced by
/// `mailparser`'s `simpleParser` on the server side.
///
/// Fields mirror `icloud-sync.service.ts` `processMessage()` columns:
/// `messageId`, `subject`, `from`, `to`, `cc`, `text`, `html`, `date`, `inReplyTo`, `references`.
/// `replyTo` and `hasAttachments` are carried in addition to those columns.
///
/// Values are produced by `MIMEParser.parse(_:)`; the notes on each field describe
/// what that parser stores there.
public struct ParsedMIMEMessage: Sendable {
/// `Message-ID` header value with the surrounding angle brackets removed;
/// `nil` when the parser found no such header.
public let messageId: String?
/// `Subject` header after RFC 2047 encoded-word decoding; `nil` when absent.
public let subject: String?
/// Mailbox parsed from the `From` header; `nil` when absent or unparseable.
public let from: MIMEAddress?
/// Mailboxes parsed from the `To` header; empty when the header is absent.
public let to: [MIMEAddress]
/// Mailboxes parsed from the `Cc` header; empty when the header is absent.
public let cc: [MIMEAddress]
/// Mailboxes parsed from the `Reply-To` header; empty when the header is absent.
public let replyTo: [MIMEAddress]
/// `In-Reply-To` header value with the surrounding angle brackets removed;
/// `nil` when absent.
public let inReplyTo: String?
/// Message IDs listed in the `References` header, in header order and without
/// angle brackets; empty when the header is absent.
public let references: [String]
/// Timestamp parsed from the `Date` header; `nil` when the header is absent
/// or is not in a format the parser recognises.
public let date: Date?
/// Decoded `text/plain` body, if the parser found one.
public let text: String?
/// Decoded `text/html` body, if the parser found one.
public let html: String?
/// Whether the parser flagged at least one body part as an attachment.
public let hasAttachments: Bool
}
/// A single mailbox taken from an address header such as `From`, `To` or `Cc`.
public struct MIMEAddress: Sendable {
/// The address portion of the mailbox. When the header value has no
/// `<...>` section, the parser stores the whole trimmed value here.
public let address: String
/// The display name that preceded the `<...>` section, or `nil` when the
/// mailbox had no (or an empty) display name.
public let name: String?
}
// MARK: - Parser
/// A minimal MIME parser that produces `ParsedMIMEMessage` from raw RFC 5322 source.
///
/// Covers the fields needed to match `simpleParser` output shape used in
/// `icloud-sync.service.ts`. Does NOT decode nested MIME parts beyond the primary
/// text/html bodies; attachment handling is deferred to server-side processing.
///
/// Implementation notes:
/// - Headers are RFC 2047 decoded (Q-encoding and B-encoding).
/// - Multipart/alternative bodies are walked to extract text/plain and text/html.
/// - Only flat multipart is fully supported; deeply nested multipart (e.g. multipart/related
/// wrapping multipart/alternative) yields only the first matching body part.
public struct MIMEParser {
// MARK: - Public API
/// Parses a raw RFC 5322 message into a `ParsedMIMEMessage`.
///
/// The source is split into a header block and a body, the headers are
/// collected into a lowercased-key dictionary, and the individual fields are
/// derived from that dictionary. Header-derived fields are `nil` (or an empty
/// array for the list-valued ones) when the corresponding header is missing.
///
/// - Parameter source: The complete raw message text.
/// - Returns: The extracted header fields, bodies and attachment flag.
public static func parse(_ source: String) -> ParsedMIMEMessage {
    let sections = splitHeadersAndBody(source)
    let headers = parseHeaders(sections.headers)

    // A message without a Content-Type header is treated as plain text.
    let bodies = extractBodies(
        from: sections.body,
        contentType: headers["content-type"] ?? "text/plain",
        headers: headers,
        fullSource: source
    )

    // Looks up an address-list header; a missing header yields an empty list.
    func addressList(_ name: String) -> [MIMEAddress] {
        guard let rawValue = headers[name] else { return [] }
        return parseAddressList(rawValue)
    }

    var sender: MIMEAddress?
    if let rawFrom = headers["from"] {
        sender = parseSingleAddress(rawFrom)
    }

    var sentDate: Date?
    if let rawDate = headers["date"] {
        sentDate = parseDate(rawDate)
    }

    var messageId: String?
    if let rawId = headers["message-id"] {
        messageId = cleanAngleBrackets(rawId)
    }

    var inReplyTo: String?
    if let rawParent = headers["in-reply-to"] {
        inReplyTo = cleanAngleBrackets(rawParent)
    }

    var subject: String?
    if let rawSubject = headers["subject"] {
        subject = decodeRFC2047(rawSubject)
    }

    return ParsedMIMEMessage(
        messageId: messageId,
        subject: subject,
        from: sender,
        to: addressList("to"),
        cc: addressList("cc"),
        replyTo: addressList("reply-to"),
        inReplyTo: inReplyTo,
        references: parseReferences(headers["references"]),
        date: sentDate,
        text: bodies.text,
        html: bodies.html,
        hasAttachments: bodies.hasAttachments
    )
}
// MARK: - Header Parsing
/// Splits a message (or MIME part) into its header block and its body.
///
/// Headers end at the first blank line. Both CRLF (`\r\n\r\n`) and bare LF
/// (`\n\n`) blank lines are recognised, and whichever occurs *earliest* in the
/// source is used, so an LF-terminated header block is not extended up to a
/// CRLF blank line that happens to appear later inside the body.
///
/// - Parameter source: Raw message or part text.
/// - Returns: The text before the separator as `headers` and the text after it
///   as `body`. When no blank line exists the whole source is returned as
///   `headers` and `body` is empty.
private static func splitHeadersAndBody(_ source: String) -> (headers: String, body: String) {
    // Find every kind of blank-line separator and keep the one that starts first.
    let separator = ["\r\n\r\n", "\n\n"]
        .compactMap { source.range(of: $0) }
        .min { $0.lowerBound < $1.lowerBound }

    guard let separator else {
        return (source, "")
    }
    return (String(source[..<separator.lowerBound]),
            String(source[separator.upperBound...]))
}
/// Builds a dictionary of header values keyed by lowercased header name.
///
/// Folded headers are unfolded: a line that begins with a space or tab is
/// appended (after trimming, separated by a single space) to the value of the
/// header that precedes it. Lines that are neither continuations nor contain a
/// colon are ignored. A header whose accumulated value is empty is not stored,
/// and when a name occurs more than once the last occurrence wins.
///
/// - Parameter block: The header section of a message or MIME part.
/// - Returns: Header values with surrounding whitespace removed.
private static func parseHeaders(_ block: String) -> [String: String] {
    var result: [String: String] = [:]
    var pendingName: String?
    var pendingValue = ""

    // Stores the header accumulated so far, provided it has a name and a value.
    func commitPending() {
        guard let name = pendingName, !pendingValue.isEmpty else { return }
        result[name] = pendingValue.trimmingCharacters(in: .whitespaces)
    }

    for rawLine in block.components(separatedBy: .newlines) where !rawLine.isEmpty {
        let startsWithBlank = rawLine.unicodeScalars.first
            .map { $0.value == 0x20 || $0.value == 0x09 } ?? false

        if startsWithBlank {
            // Folded continuation of the previous header line.
            pendingValue += " " + rawLine.trimmingCharacters(in: .whitespaces)
            continue
        }

        guard let colon = rawLine.firstIndex(of: ":") else { continue }

        // A new header starts here, so the previous one is complete.
        commitPending()
        pendingName = String(rawLine[rawLine.startIndex..<colon])
            .lowercased()
            .trimmingCharacters(in: .whitespaces)
        pendingValue = String(rawLine[rawLine.index(after: colon)...])
            .trimmingCharacters(in: .whitespaces)
    }

    commitPending()
    return result
}
// MARK: - Body Extraction
/// Extracts the displayable bodies from a message body or multipart section.
///
/// - `text/plain` and `text/html` content is decoded according to its
///   `Content-Transfer-Encoding` and returned as `text` or `html` respectively.
/// - Any other non-multipart content type returns the raw body as `text`.
/// - `multipart/*` content is split on its boundary. The first `text/plain`
///   part and the first `text/html` part win. Nested multipart containers are
///   processed recursively, and their text body, HTML body and attachment flag
///   are all merged into the result (e.g. `multipart/mixed` wrapping
///   `multipart/alternative` still yields the HTML alternative).
/// - A part counts as an attachment when its `Content-Disposition` mentions
///   `attachment`, or when it is neither `text/*` nor `multipart/*`.
///
/// - Parameters:
///   - body: The body text that follows the headers.
///   - contentType: The raw `Content-Type` value (original case, so the
///     boundary parameter is preserved).
///   - headers: The parsed headers belonging to `body`.
///   - fullSource: Not used by this implementation; kept so existing call
///     sites continue to compile.
/// - Returns: The decoded plain-text body, the decoded HTML body, and whether
///   any attachment part was seen.
private static func extractBodies(
    from body: String,
    contentType: String,
    headers: [String: String],
    fullSource: String
) -> (text: String?, html: String?, hasAttachments: Bool) {
    let ctLower = contentType.lowercased()
    let transferEncoding = headers["content-transfer-encoding"]

    if ctLower.hasPrefix("text/plain") {
        return (decodeBody(body, transferEncoding: transferEncoding), nil, false)
    }
    if ctLower.hasPrefix("text/html") {
        return (nil, decodeBody(body, transferEncoding: transferEncoding), false)
    }
    guard ctLower.hasPrefix("multipart/") else {
        // Unknown content-type, return raw body as text
        return (body.isEmpty ? nil : body, nil, false)
    }
    guard let boundary = extractBoundary(contentType) else {
        return (nil, nil, false)
    }

    var textPart: String?
    var htmlPart: String?
    var hasAttachments = false

    for part in splitMultipart(body, boundary: boundary) {
        let (partHeaders, partBody) = splitHeadersAndBody(part)
        let partParsed = parseHeaders(partHeaders)
        // A part without its own Content-Type defaults to text/plain.
        let partCT = (partParsed["content-type"] ?? "text/plain").lowercased()
        let partCTE = partParsed["content-transfer-encoding"]
        let disposition = (partParsed["content-disposition"] ?? "").lowercased()

        if disposition.contains("attachment") {
            hasAttachments = true
            continue
        }

        if partCT.hasPrefix("text/plain"), textPart == nil {
            textPart = decodeBody(partBody, transferEncoding: partCTE)
        } else if partCT.hasPrefix("text/html"), htmlPart == nil {
            htmlPart = decodeBody(partBody, transferEncoding: partCTE)
        } else if partCT.hasPrefix("multipart/") {
            // Recurse into the nested container and keep everything it found,
            // not just the plain-text body. The raw (non-lowercased) header is
            // passed so the boundary keeps its original case.
            let nested = extractBodies(
                from: partBody,
                contentType: partParsed["content-type"] ?? partCT,
                headers: partParsed,
                fullSource: partBody
            )
            if textPart == nil { textPart = nested.text }
            if htmlPart == nil { htmlPart = nested.html }
            if nested.hasAttachments { hasAttachments = true }
        } else if !partCT.hasPrefix("text/") && !partCT.isEmpty {
            hasAttachments = true
        }
    }
    return (textPart, htmlPart, hasAttachments)
}
/// Returns the `boundary` parameter of a `Content-Type` header value.
///
/// The value is split on `;`, and the first parameter whose name is
/// `boundary` (compared case-insensitively) is used. One pair of surrounding
/// double quotes is removed from the value.
///
/// - Parameter contentType: The raw `Content-Type` header value.
/// - Returns: The boundary string, or `nil` when no boundary parameter exists.
private static func extractBoundary(_ contentType: String) -> String? {
    let parameterPrefix = "boundary="

    let parameters = contentType
        .components(separatedBy: ";")
        .map { $0.trimmingCharacters(in: .whitespaces) }

    guard let parameter = parameters.first(where: { $0.lowercased().hasPrefix(parameterPrefix) }) else {
        return nil
    }

    let rawValue = String(parameter.dropFirst(parameterPrefix.count))
        .trimmingCharacters(in: .whitespaces)

    // boundary="..." or boundary=...
    let isQuoted = rawValue.hasPrefix("\"") && rawValue.hasSuffix("\"")
    return isQuoted ? String(rawValue.dropFirst().dropLast()) : rawValue
}
/// Splits a multipart body into its raw parts (part headers plus part body).
///
/// Text before the first `--boundary` line (the preamble) and after the closing
/// `--boundary--` line (the epilogue) is discarded. Each returned part has its
/// lines joined with `\n`, and every line is followed by `\n`.
///
/// CRLF sequences are collapsed to a single line break before splitting.
/// `CharacterSet.newlines` treats CR and LF as two separate separators, which
/// would otherwise insert an empty line after every CRLF-terminated line and
/// break quoted-printable soft line breaks. A final part that is not followed
/// by a closing delimiter (e.g. a truncated message) is still returned.
///
/// - Parameters:
///   - body: The multipart body text.
///   - boundary: The boundary string without the leading `--`.
/// - Returns: The non-empty parts in order of appearance.
private static func splitMultipart(_ body: String, boundary: String) -> [String] {
    let delimiter = "--\(boundary)"
    let endDelimiter = "--\(boundary)--"

    let lines = body
        .replacingOccurrences(of: "\r\n", with: "\n")
        .components(separatedBy: .newlines)

    var parts: [String] = []
    var currentPart = ""
    var inPart = false
    var sawClosingDelimiter = false

    for line in lines {
        let marker = line.trimmingCharacters(in: .whitespaces)
        if marker == endDelimiter {
            if inPart && !currentPart.isEmpty { parts.append(currentPart) }
            sawClosingDelimiter = true
            break
        }
        if marker == delimiter {
            if inPart && !currentPart.isEmpty { parts.append(currentPart) }
            currentPart = ""
            inPart = true
            continue
        }
        if inPart { currentPart += line + "\n" }
    }

    // Keep the trailing part of a message whose closing delimiter is missing.
    if !sawClosingDelimiter && inPart && !currentPart.isEmpty {
        parts.append(currentPart)
    }
    return parts
}
/// Decodes a body according to its `Content-Transfer-Encoding`.
///
/// - `base64`: line breaks are removed and the payload is decoded, then
///   interpreted as UTF-8 with ISO-8859-1 as fallback. If the payload is not
///   valid base64 the body is returned unchanged.
/// - `quoted-printable`: decoded via `decodeQuotedPrintable`.
/// - anything else (including a missing encoding): returned unchanged.
///
/// - Parameters:
///   - body: The encoded body text.
///   - transferEncoding: The raw header value; compared case-insensitively
///     after trimming whitespace.
/// - Returns: The decoded text, or `nil` when `body` is empty.
private static func decodeBody(_ body: String, transferEncoding: String?) -> String? {
    if body.isEmpty { return nil }

    let mechanism = (transferEncoding ?? "")
        .lowercased()
        .trimmingCharacters(in: .whitespaces)

    switch mechanism {
    case "base64":
        let payload = body.components(separatedBy: .newlines).joined()
        guard let bytes = Data(base64Encoded: payload, options: .ignoreUnknownCharacters) else {
            return body // Return raw if we can't decode
        }
        if let utf8Text = String(data: bytes, encoding: .utf8) {
            return utf8Text
        }
        if let latin1Text = String(data: bytes, encoding: .isoLatin1) {
            return latin1Text
        }
        return body // Return raw if we can't decode
    case "quoted-printable":
        return decodeQuotedPrintable(body)
    default:
        return body
    }
}
// MARK: - Quoted-Printable
/// Decodes quoted-printable text (RFC 2045 §6.7).
///
/// Soft line breaks (`=` immediately followed by CRLF or LF) are removed, and
/// every `=XY` escape with two hexadecimal digits is replaced by the byte it
/// denotes. Decoding works on bytes, so a multi-byte UTF-8 character written as
/// several escapes (e.g. `=C3=A9`) is reassembled into a single character. The
/// resulting bytes are interpreted as UTF-8, falling back to ISO-8859-1 when
/// they are not valid UTF-8.
///
/// An `=` that is not followed by two hex digits is copied through unchanged.
///
/// - Parameter input: The quoted-printable encoded text.
/// - Returns: The decoded text.
private static func decodeQuotedPrintable(_ input: String) -> String {
    // Join soft-line-breaks (= at end of line) before processing
    let joined = input
        .replacingOccurrences(of: "=\r\n", with: "")
        .replacingOccurrences(of: "=\n", with: "")

    // Value of an ASCII hex digit, or nil for any other byte.
    func hexValue(_ byte: UInt8) -> UInt8? {
        Character(UnicodeScalar(byte)).hexDigitValue.map { UInt8($0) }
    }

    let source = Array(joined.utf8)
    let escapeMarker = UInt8(ascii: "=")
    var bytes: [UInt8] = []
    bytes.reserveCapacity(source.count)

    var position = 0
    while position < source.count {
        let byte = source[position]
        // An escape needs BOTH hex digits after the "=".
        if byte == escapeMarker,
           position + 2 < source.count,
           let high = hexValue(source[position + 1]),
           let low = hexValue(source[position + 2]) {
            bytes.append((high << 4) | low)
            position += 3
        } else {
            bytes.append(byte)
            position += 1
        }
    }

    let data = Data(bytes)
    return String(data: data, encoding: .utf8)
        ?? String(data: data, encoding: .isoLatin1)
        ?? String(decoding: bytes, as: UTF8.self)
}
// MARK: - Address Parsing
/// Parses a comma-separated address header (`To`, `Cc`, `Reply-To`) into
/// individual mailboxes.
///
/// The value is split on commas that are outside a quoted string and outside
/// an angle-bracket section. Inside a quoted string a backslash escapes the
/// next character, so an escaped quote (`\"`) does not end the quoted string,
/// and `<` / `>` inside quotes do not affect angle-bracket tracking. Each
/// segment is trimmed and handed to `parseSingleAddress`; segments it rejects
/// (e.g. empty ones) are skipped.
///
/// - Parameter value: The raw header value.
/// - Returns: The mailboxes in header order.
private static func parseAddressList(_ value: String) -> [MIMEAddress] {
    var addresses: [MIMEAddress] = []
    var current = ""
    var inAngle = false
    var inQuote = false
    var isEscaped = false

    // Parses the segment collected so far and starts a new one.
    func flushCurrent() {
        if let addr = parseSingleAddress(current.trimmingCharacters(in: .whitespaces)) {
            addresses.append(addr)
        }
        current = ""
    }

    for ch in value {
        if isEscaped {
            // Character following a backslash inside quotes: taken literally.
            isEscaped = false
        } else if inQuote {
            if ch == "\\" {
                isEscaped = true
            } else if ch == "\"" {
                inQuote = false
            }
        } else if ch == "\"" {
            inQuote = true
        } else if ch == "<" {
            inAngle = true
        } else if ch == ">" {
            inAngle = false
        } else if ch == "," && !inAngle {
            flushCurrent()
            continue
        }
        current.append(ch)
    }
    flushCurrent()
    return addresses
}
/// Parses one mailbox of the form `Display Name <addr>` or a bare `addr`.
///
/// The structure is determined on the raw value first, and RFC 2047 decoding
/// is applied afterwards to the display name only. Decoding first would let
/// `<`, `>` or `,` hidden inside an encoded display name be mistaken for
/// address syntax, making a spoofed address win over the real one.
///
/// The address starts at the first `<` that is not inside a quoted string
/// (falling back to the first `<` at all when quotes are unbalanced) and ends
/// at the next `>`.
///
/// - Parameter raw: The raw mailbox text.
/// - Returns: The parsed mailbox, or `nil` when the value or the bracketed
///   address is empty. A value without a usable `<...>` section is decoded and
///   returned as the address with no name.
private static func parseSingleAddress(_ raw: String) -> MIMEAddress? {
    // Find the first "<" outside a quoted string; backslash escapes inside quotes.
    var unquotedOpen: String.Index?
    var inQuote = false
    var isEscaped = false
    for index in raw.indices {
        let ch = raw[index]
        if isEscaped {
            isEscaped = false
        } else if inQuote {
            if ch == "\\" {
                isEscaped = true
            } else if ch == "\"" {
                inQuote = false
            }
        } else if ch == "\"" {
            inQuote = true
        } else if ch == "<" {
            unquotedOpen = index
            break
        }
    }

    if let open = unquotedOpen ?? raw.firstIndex(of: "<") {
        let afterOpen = raw.index(after: open)
        if let close = raw[afterOpen...].firstIndex(of: ">") {
            let addr = String(raw[afterOpen..<close]).trimmingCharacters(in: .whitespaces)
            guard !addr.isEmpty else { return nil }
            let name = decodeRFC2047(String(raw[raw.startIndex..<open]))
                .trimmingCharacters(in: .whitespaces)
                .trimmingCharacters(in: CharacterSet(charactersIn: "\""))
            return MIMEAddress(address: addr, name: name.isEmpty ? nil : name)
        }
    }

    let clean = decodeRFC2047(raw).trimmingCharacters(in: .whitespaces)
    guard !clean.isEmpty else { return nil }
    return MIMEAddress(address: clean, name: nil)
}
// MARK: - RFC 2047 Decode
/// Decodes RFC 2047 encoded-words (`=?charset?encoding?text?=`) in a header value.
///
/// - Both `B` (base64) and `Q` encodings are supported, case-insensitively.
/// - The decoded bytes are interpreted in the declared charset; if that fails,
///   UTF-8 and then ISO-8859-1 are tried. Unknown charsets are treated as UTF-8.
/// - Whitespace that separates two adjacent encoded-words is dropped, as
///   required by RFC 2047 §6.2.
/// - An encoded-word whose payload cannot be decoded is left in place verbatim.
///
/// The output is assembled by copying the text between matches and appending
/// each decoded word, so no offset bookkeeping between UTF-16 ranges and
/// `Character` counts is needed.
///
/// - Parameter input: A header value that may contain encoded-words.
/// - Returns: The value with every decodable encoded-word replaced by its text.
static func decodeRFC2047(_ input: String) -> String {
    // Pattern: =?<charset>?<B|Q>?<encoded>?=
    let pattern = #"=\?([^?]+)\?([BQbq])\?([^?]*)\?="#
    guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else {
        return input
    }

    // NSRegularExpression reports UTF-16 ranges, so slice through NSString.
    let source = input as NSString
    let matches = regex.matches(in: input, range: NSRange(location: 0, length: source.length))
    guard !matches.isEmpty else { return input }

    var output = ""
    var cursor = 0 // UTF-16 offset of the first unit not yet copied to `output`
    var previousWasDecoded = false

    for match in matches where match.numberOfRanges == 4 {
        let gap = source.substring(with: NSRange(location: cursor, length: match.range.location - cursor))
        let literal = source.substring(with: match.range)
        cursor = match.range.location + match.range.length

        let decoded = decodeRFC2047Word(
            charset: source.substring(with: match.range(at: 1)),
            encoding: source.substring(with: match.range(at: 2)),
            text: source.substring(with: match.range(at: 3))
        )

        // Whitespace between two decoded encoded-words is only a separator.
        let gapIsSeparator = !gap.isEmpty && gap.allSatisfy(\.isWhitespace)
        if !(previousWasDecoded && decoded != nil && gapIsSeparator) {
            output += gap
        }
        output += decoded ?? literal
        previousWasDecoded = decoded != nil
    }

    output += source.substring(from: cursor)
    return output
}

/// Decodes the payload of a single encoded-word, or returns `nil` when the
/// payload is not valid for the given encoding.
private static func decodeRFC2047Word(charset: String, encoding: String, text: String) -> String? {
    let data: Data
    switch encoding.uppercased() {
    case "B":
        // Some senders omit the trailing "=" padding; restore it before decoding.
        let remainder = text.utf8.count % 4
        let padded = remainder == 0 ? text : text + String(repeating: "=", count: 4 - remainder)
        guard let payload = Data(base64Encoded: padded) else { return nil }
        data = payload
    case "Q":
        data = decodeRFC2047QBytes(text)
    default:
        return nil
    }

    return String(data: data, encoding: rfc2047StringEncoding(forCharset: charset))
        ?? String(data: data, encoding: .utf8)
        ?? String(data: data, encoding: .isoLatin1)
}

/// Converts Q-encoded text to raw bytes: `_` becomes a space, `=XY` becomes the
/// byte with hex value XY, and everything else is copied through.
private static func decodeRFC2047QBytes(_ text: String) -> Data {
    // Value of an ASCII hex digit, or nil for any other byte.
    func hexValue(_ byte: UInt8) -> UInt8? {
        Character(UnicodeScalar(byte)).hexDigitValue.map { UInt8($0) }
    }

    let source = Array(text.utf8)
    var bytes = Data(capacity: source.count)
    var position = 0
    while position < source.count {
        let byte = source[position]
        if byte == UInt8(ascii: "_") {
            bytes.append(0x20)
            position += 1
        } else if byte == UInt8(ascii: "="),
                  position + 2 < source.count,
                  let high = hexValue(source[position + 1]),
                  let low = hexValue(source[position + 2]) {
            bytes.append((high << 4) | low)
            position += 3
        } else {
            bytes.append(byte)
            position += 1
        }
    }
    return bytes
}

/// Maps a MIME charset label to a `String.Encoding`, defaulting to UTF-8.
private static func rfc2047StringEncoding(forCharset charset: String) -> String.Encoding {
    // RFC 2231 allows a language suffix such as "utf-8*en"; ignore it.
    let label = charset.lowercased().split(separator: "*").first.map(String.init) ?? ""
    switch label {
    case "utf-8", "utf8": return .utf8
    case "iso-8859-1", "latin1": return .isoLatin1
    case "iso-8859-2", "latin2": return .isoLatin2
    case "windows-1250", "cp1250": return .windowsCP1250
    case "windows-1251", "cp1251": return .windowsCP1251
    case "windows-1252", "cp1252": return .windowsCP1252
    case "windows-1253", "cp1253": return .windowsCP1253
    case "windows-1254", "cp1254": return .windowsCP1254
    case "iso-2022-jp": return .iso2022JP
    case "shift_jis", "shift-jis", "sjis": return .shiftJIS
    case "euc-jp": return .japaneseEUC
    case "utf-16": return .utf16
    default:
        // Keeps the previous substring-based behaviour for labels not listed above.
        return label.contains("iso-8859-1") ? .isoLatin1 : .utf8
    }
}
// MARK: - Misc Helpers
/// Strips one leading `<` and one trailing `>` from a message-ID style value.
///
/// Whitespace is trimmed before the brackets are removed and again afterwards,
/// so `" < id > "` becomes `"id"`. Values without brackets are only trimmed.
///
/// - Parameter s: The raw header token.
/// - Returns: The token without surrounding whitespace and angle brackets.
private static func cleanAngleBrackets(_ s: String) -> String {
    let trimmed = s.trimmingCharacters(in: .whitespaces)
    let withoutOpen = trimmed.hasPrefix("<") ? trimmed.dropFirst() : Substring(trimmed)
    let withoutClose = withoutOpen.hasSuffix(">") ? withoutOpen.dropLast() : withoutOpen
    return String(withoutClose).trimmingCharacters(in: .whitespaces)
}
/// Splits a `References` header into its individual message IDs.
///
/// The value is split on whitespace, each token has its angle brackets
/// removed via `cleanAngleBrackets`, and tokens that end up empty are dropped.
///
/// - Parameter raw: The raw header value, or `nil` when the header is absent.
/// - Returns: The message IDs in header order; empty when `raw` is `nil`.
private static func parseReferences(_ raw: String?) -> [String] {
    guard let header = raw else { return [] }

    var messageIds: [String] = []
    for token in header.components(separatedBy: .whitespaces) {
        let messageId = cleanAngleBrackets(token)
        if !messageId.isEmpty {
            messageIds.append(messageId)
        }
    }
    return messageIds
}
/// Parses an RFC 5322 `Date` header value.
///
/// Parenthesised comments such as the trailing `(UTC)` or `(PST)` that many
/// mail systems append are removed before parsing, so the result does not
/// depend on the formatter tolerating trailing text. The accepted layouts are
/// the date with or without a leading day name, with a numeric (`+0000`) or
/// named (`GMT`) zone, and with or without seconds (seconds are optional in
/// RFC 5322).
///
/// - Parameter raw: The raw header value.
/// - Returns: The parsed date, or `nil` when no supported layout matches.
private static func parseDate(_ raw: String) -> Date? {
    // Strip comments innermost-first so nested "(a (b))" forms disappear too.
    var cleaned = raw
    while let comment = cleaned.range(of: #"\s*\([^()]*\)"#, options: .regularExpression) {
        cleaned.removeSubrange(comment)
    }
    cleaned = cleaned.trimmingCharacters(in: .whitespaces)

    let formats = [
        "EEE, dd MMM yyyy HH:mm:ss Z",
        "dd MMM yyyy HH:mm:ss Z",
        "EEE, dd MMM yyyy HH:mm:ss z",
        "dd MMM yyyy HH:mm:ss z",
        "EEE, dd MMM yyyy HH:mm Z",
        "dd MMM yyyy HH:mm Z",
    ]

    let df = DateFormatter()
    // Fixed-format parsing must not depend on the user's locale settings.
    df.locale = Locale(identifier: "en_US_POSIX")
    for format in formats {
        df.dateFormat = format
        if let date = df.date(from: cleaned) { return date }
    }
    return nil
}
}