#!/usr/bin/env bun
/**
 * scrape-flights.ts — Kayak flight price monitor
 *
 * ⚠️ POOR RESULTS — DO NOT RELY ON FOR BOOKING DECISIONS (as of 2026-04-20)
 *
 * Experience: Used to search LAX→CVG Apr 27 and CVG→LAX May 5.
 * The scraper reported $104–$116 economy fares. When we pulled up the actual
 * Kayak pages, real prices were $350–$639 (economy Frontier with 15h layovers,
 * or First class on AA/Delta at $618–$639). The $104 figure was almost certainly
 * the scraper mis-reading an ad, a promo teaser, or a dynamic element that
 * loaded before real results rendered.
 *
 * Root causes:
 *   1. Price extraction uses a generic "find any $NNN text" heuristic — picks up
 *      banners, hotel ads, and partial prices injected before flight results load.
 *   2. Kayak aggressively bot-detects and serves degraded/empty pages. Tor exit
 *      nodes are blocklisted by Kayak. Direct requests eventually get through but
 *      the page state at scrape time may be incomplete.
 *   3. depTime/arrTime extraction never worked — selectors don't match Kayak DOM.
 *   4. Duration regex picked up "4h 0m" which was accurate once but not reliably.
 *
 * Mitigation for (1): price extraction now prefers elements inside result cards
 * (see RESULT_CARD_SELECTOR) and only falls back to the whole document, with a
 * warning, when no card matches. NOTE(review): the card selectors have not been
 * verified against the live Kayak DOM — the warning above still applies.
 *
 * What actually works: open the Kayak links directly in a browser.
 * The targets and API POST pipeline (flight-monitor) are fine — the scraping
 * layer is what's broken.
 *
 * Scrapes economy and business class prices from Kayak for tracked flights
 * and posts results to the /api/flight-monitor endpoint.
 *
 * Usage:
 *   bun scrape-flights.ts                                 # scrape all flights and POST results
 *   bun scrape-flights.ts --dry-run                       # print results, don't POST
 *   bun scrape-flights.ts --debug                         # headed browser + /tmp screenshots
 *   bun scrape-flights.ts --flight flight-lax-cvg-apr27   # single flight
 *   bun scrape-flights.ts --tor                           # route through the Tor proxy
 *
 * Env vars:
 *   QUINN_MY_SERVICE_TOKEN — Bearer token for API auth (required unless --dry-run)
 *   API_BASE               — API origin (default http://localhost:3024)
 *   DRY_RUN=1              — same as --dry-run
 *   DEBUG=1                — same as --debug
 *   USE_TOR=1              — route through Tor SOCKS5 :9050 (Kayak blocklists Tor exit nodes, ineffective)
 *   TOR_PROXY              — proxy URL used with --tor / USE_TOR (default socks5://127.0.0.1:9050)
 */

import { chromium, type Browser, type BrowserContext, type Page } from 'playwright';
import { createLogger } from '@lilith/logger';

const log = createLogger('flight-scraper');

const API_BASE = process.env.API_BASE ?? 'http://localhost:3024';
const SERVICE_TOKEN = process.env.QUINN_MY_SERVICE_TOKEN ?? '';
const DRY_RUN = process.argv.includes('--dry-run') || process.env.DRY_RUN === '1';
const DEBUG = process.argv.includes('--debug') || process.env.DEBUG === '1';

/** Position of `--flight` in argv, or -1 when the flag is absent. */
const FLIGHT_FLAG_INDEX = process.argv.indexOf('--flight');

/** Flight id given after `--flight`; null when the flag is absent or has no value. */
const SINGLE: string | null =
  FLIGHT_FLAG_INDEX >= 0 ? (process.argv[FLIGHT_FLAG_INDEX + 1] ?? null) : null;

// ── Target definitions ────────────────────────────────────────────────────────

/** One tracked flight search: identity, route, and the Kayak URLs to scrape. */
interface FlightTarget {
  id: string;
  airline: string;
  route: string;
  origin: string;
  destination: string;
  depDate: string;
  searchUrl: string;
  bizUrl: string;
  monitor: boolean;
}

const ALL_TARGETS: FlightTarget[] = [
  // ── LA → CVG (Apr 27 outbound — car pre-positioned in Palmdale, fly from LA) ─
  {
    id: 'flight-lax-cvg-apr27',
    airline: 'any',
    route: 'LAX → CVG',
    origin: 'LAX',
    destination: 'CVG',
    depDate: '2026-04-27',
    searchUrl: 'https://www.kayak.com/flights/LAX-CVG/2026-04-27?sort=bestflight_a',
    bizUrl: 'https://www.kayak.com/flights/LAX-CVG/2026-04-27/business?sort=price_a',
    monitor: true,
  },
  {
    id: 'flight-bur-cvg-apr27',
    airline: 'any',
    route: 'BUR → CVG',
    origin: 'BUR',
    destination: 'CVG',
    depDate: '2026-04-27',
    searchUrl: 'https://www.kayak.com/flights/BUR-CVG/2026-04-27?sort=bestflight_a',
    bizUrl: 'https://www.kayak.com/flights/BUR-CVG/2026-04-27/business?sort=price_a',
    monitor: true,
  },
  {
    id: 'flight-cvg-bur-may4-delta',
    airline: 'Delta',
    route: 'CVG → BUR',
    origin: 'CVG',
    destination: 'BUR',
    depDate: '2026-05-04',
    searchUrl: 'https://www.kayak.com/flights/CVG-BUR/2026-05-04?sort=price_a',
    bizUrl: 'https://www.kayak.com/flights/CVG-BUR/2026-05-04/business?sort=price_a',
    monitor: true,
  },
  {
    id: 'flight-cvg-bur-may1-aa',
    airline: 'American',
    route: 'CVG → BUR',
    origin: 'CVG',
    destination: 'BUR',
    depDate: '2026-05-01',
    searchUrl: 'https://www.kayak.com/flights/CVG-BUR/2026-05-01?sort=price_a',
    bizUrl: 'https://www.kayak.com/flights/CVG-BUR/2026-05-01/business?sort=price_a',
    monitor: true,
  },
  // ── CVG → LA (May 5 & 6) ─────────────────────────────────────────────────────
  {
    id: 'flight-cvg-lax-may5',
    airline: 'any',
    route: 'CVG → LAX',
    origin: 'CVG',
    destination: 'LAX',
    depDate: '2026-05-05',
    searchUrl: 'https://www.kayak.com/flights/CVG-LAX/2026-05-05?sort=price_a',
    bizUrl: 'https://www.kayak.com/flights/CVG-LAX/2026-05-05/business?sort=price_a',
    monitor: true,
  },
  {
    id: 'flight-cvg-bur-may5',
    airline: 'any',
    route: 'CVG → BUR',
    origin: 'CVG',
    destination: 'BUR',
    depDate: '2026-05-05',
    searchUrl: 'https://www.kayak.com/flights/CVG-BUR/2026-05-05?sort=price_a',
    bizUrl: 'https://www.kayak.com/flights/CVG-BUR/2026-05-05/business?sort=price_a',
    monitor: true,
  },
  {
    id: 'flight-cvg-lax-may6',
    airline: 'any',
    route: 'CVG → LAX',
    origin: 'CVG',
    destination: 'LAX',
    depDate: '2026-05-06',
    searchUrl: 'https://www.kayak.com/flights/CVG-LAX/2026-05-06?sort=price_a',
    bizUrl: 'https://www.kayak.com/flights/CVG-LAX/2026-05-06/business?sort=price_a',
    monitor: true,
  },
  {
    id: 'flight-cvg-bur-may6',
    airline: 'any',
    route: 'CVG → BUR',
    origin: 'CVG',
    destination: 'BUR',
    depDate: '2026-05-06',
    searchUrl: 'https://www.kayak.com/flights/CVG-BUR/2026-05-06?sort=price_a',
    bizUrl: 'https://www.kayak.com/flights/CVG-BUR/2026-05-06/business?sort=price_a',
    monitor: true,
  },
];

// ── Scrape result types ───────────────────────────────────────────────────────

/** Payload element POSTed to /api/flight-monitor/snapshots for one target. */
interface ScrapeResult {
  target: { id: string; airline: string; route: string };
  snapshot: {
    price?: number;
    bizAvail?: boolean | null;
    bizPrice?: number;
    scarcity?: string;
    depTime?: string;
    arrTime?: string;
    duration?: string;
    scrapedAt: string;
  };
}

// ── Price parsing (pure, runs in Node) ────────────────────────────────────────

/**
 * Selector used both to detect that results rendered and to scope price
 * extraction.
 * NOTE(review): presumed to match Kayak result cards — not verified against
 * the live DOM.
 */
const RESULT_CARD_SELECTOR = '[class*="resultInner"], [class*="result-item"], [data-resultid]';

/** Prices at or below this are treated as noise (fees, teasers), not fares. */
const MIN_PLAUSIBLE_PRICE = 50;
/** Exclusive upper bound for a believable economy fare. */
const MAX_ECONOMY_PRICE = 10_000;
/** Exclusive upper bound for a believable business fare. */
const MAX_BUSINESS_PRICE = 50_000;

/**
 * Parses a whole-dollar price label such as "$350" or "$1,234".
 *
 * @param text Text content of a single leaf element.
 * @returns The dollar amount, or null when the text is not exactly a price
 *   (comma grouping, when present, must be well-formed).
 */
function parsePriceText(text: string): number | null {
  const trimmed = text.trim();
  if (!/^\$(?:\d{1,3}(?:,\d{3})+|\d+)$/.test(trimmed)) return null;
  const value = Number.parseInt(trimmed.replace(/\D/g, ''), 10);
  return Number.isNaN(value) ? null : value;
}

/**
 * Picks the lowest plausible fare from a list of price labels.
 *
 * @param texts Candidate labels, e.g. ["$350", "$1,234"].
 * @param maxExclusive Values at or above this are discarded.
 * @returns The smallest value strictly between MIN_PLAUSIBLE_PRICE and
 *   `maxExclusive`, or null when none qualifies.
 */
function lowestPrice(texts: readonly string[], maxExclusive: number): number | null {
  let best: number | null = null;
  for (const text of texts) {
    const value = parsePriceText(text);
    if (value === null || value <= MIN_PLAUSIBLE_PRICE || value >= maxExclusive) continue;
    if (best === null || value < best) best = value;
  }
  return best;
}

// ── Kayak scraping ────────────────────────────────────────────────────────────

/** Sleeps for a random duration in [min, max) milliseconds. */
async function delay(min: number, max: number): Promise<void> {
  const ms = min + Math.random() * (max - min);
  await new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Waits until the page shows a price-like element or a result card.
 *
 * @param page Page that has already been navigated.
 * @param timeout Maximum wait in milliseconds.
 * @returns true when something result-like appeared, false on timeout/error.
 */
async function waitForResults(page: Page, timeout: number): Promise<boolean> {
  try {
    // Playwright's signature is (pageFunction, arg, options): the options object
    // must be the THIRD argument, otherwise `timeout` is silently ignored.
    await page.waitForFunction(
      (cardSelector) => {
        const priceEls = document.querySelectorAll('[class*="price"], [class*="Price"]');
        for (const el of Array.from(priceEls)) {
          if (/\$\d+/.test(el.textContent ?? '')) return true;
        }
        return document.querySelectorAll(cardSelector).length > 0;
      },
      RESULT_CARD_SELECTOR,
      { timeout },
    );
    return true;
  } catch {
    return false;
  }
}

/**
 * Waits until the page body contains a "no results"-style message.
 *
 * @param page Page that has already been navigated.
 * @param timeout Maximum wait in milliseconds.
 * @returns true when such a message appeared, false on timeout/error.
 */
async function waitForNoResults(page: Page, timeout: number): Promise<boolean> {
  try {
    await page.waitForFunction(
      () => {
        const body = document.body.innerText ?? '';
        return /no results/i.test(body) || /no flights/i.test(body) || /couldn.t find/i.test(body);
      },
      undefined, // no arg — keeps `{ timeout }` in the options position
      { timeout },
    );
    return true;
  } catch {
    return false;
  }
}

/**
 * Collects the text of every leaf element that looks like a price label.
 *
 * Searches inside result cards when any exist so that banners and ads outside
 * the result list are ignored; otherwise searches the whole document.
 *
 * @returns The labels found and whether the search was scoped to result cards.
 */
async function collectPriceTexts(page: Page): Promise<{ texts: string[]; scoped: boolean }> {
  return page.evaluate((cardSelector) => {
    const cards = Array.from(document.querySelectorAll(cardSelector));
    const roots: Array<Element | Document> = cards.length > 0 ? cards : [document];
    const texts: string[] = [];
    for (const root of roots) {
      for (const el of Array.from(root.querySelectorAll('*'))) {
        if (el.children.length !== 0) continue; // leaf elements only
        const text = el.textContent?.trim() ?? '';
        // Accept comma-grouped amounts; strict validation happens in Node.
        if (/^\$[\d,]+$/.test(text)) texts.push(text);
      }
    }
    return { texts, scoped: cards.length > 0 };
  }, RESULT_CARD_SELECTOR);
}

/**
 * Extracts the lowest plausible price currently shown on the page.
 *
 * @param page Page with rendered results.
 * @param maxExclusive Upper plausibility bound for the fare class.
 * @param context Log context (target id and fare class).
 */
async function extractLowestPrice(
  page: Page,
  maxExclusive: number,
  context: { id: string; cabin: string },
): Promise<number | null> {
  const { texts, scoped } = await collectPriceTexts(page);
  if (!scoped) {
    log.warn(`no result cards matched; price taken from whole page (may include ads)`, context);
  }
  return lowestPrice(texts, maxExclusive);
}

interface EconomyData {
  price: number | null;
  scarcity: string | null;
  depTime: string | null;
  arrTime: string | null;
  duration: string | null;
}

/** An EconomyData with every field null — the "nothing scraped" value. */
function emptyEconomy(): EconomyData {
  return { price: null, scarcity: null, depTime: null, arrTime: null, duration: null };
}

/**
 * Scrapes the economy search page for a target.
 *
 * Tries `networkidle` navigation first, then `domcontentloaded`. Returns an
 * all-null EconomyData when navigation fails or no results render.
 */
async function scrapeEconomy(page: Page, target: FlightTarget): Promise<EconomyData> {
  log.info(`scraping economy`, { id: target.id, url: target.searchUrl });

  try {
    await page.goto(target.searchUrl, { waitUntil: 'networkidle', timeout: 30_000 });
  } catch {
    log.debug(`networkidle timed out, falling back to domcontentloaded`, { id: target.id });
    try {
      await page.goto(target.searchUrl, { waitUntil: 'domcontentloaded', timeout: 30_000 });
    } catch (err) {
      log.warn(`navigation failed`, { id: target.id, err: String(err) });
      return emptyEconomy();
    }
  }

  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-econ-loaded.png`, fullPage: false });
  }

  const hasResults = await waitForResults(page, 30_000);
  if (!hasResults) {
    log.warn(`no price elements found for economy`, { id: target.id });
    if (DEBUG) {
      await page.screenshot({ path: `/tmp/flight-${target.id}-econ-noresult.png`, fullPage: true });
    }
    return emptyEconomy();
  }

  // Give late-rendering result rows a moment to settle.
  await delay(1000, 2000);

  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-econ-results.png`, fullPage: false });
  }

  const price = await extractLowestPrice(page, MAX_ECONOMY_PRICE, { id: target.id, cabin: 'economy' });

  // Page-wide heuristics; per the file header these are known to be unreliable.
  const details = await page.evaluate((): Omit<EconomyData, 'price'> => {
    const bodyText = document.body.innerText;

    const scarcityMatch = bodyText.match(/(\d+)\s*seats?\s*(left|remaining)/i);
    const scarcity = scarcityMatch ? scarcityMatch[0] : null;

    let depTime: string | null = null;
    let arrTime: string | null = null;
    const timeEls = Array.from(
      document.querySelectorAll('[class*="depart"], [class*="Depart"], [class*="time"], [class*="Time"]'),
    );
    if (timeEls.length >= 2) {
      const dep = timeEls[0]?.textContent?.trim() ?? null;
      const arr = timeEls[1]?.textContent?.trim() ?? null;
      if (dep && /\d+:\d+\s*(am|pm)/i.test(dep)) depTime = dep;
      if (arr && /\d+:\d+\s*(am|pm)/i.test(arr)) arrTime = arr;
    }

    const durationMatch = bodyText.match(/(\d+h\s*\d*m?|\d+\s*hr?\s*\d*\s*m(?:in)?)/i);
    const duration = durationMatch ? durationMatch[0].trim() : null;

    return { scarcity, depTime, arrTime, duration };
  });

  const data: EconomyData = { price, ...details };
  log.info(`economy result`, {
    id: target.id,
    price: data.price,
    scarcity: data.scarcity,
    duration: data.duration,
  });
  return data;
}

interface BizData {
  bizAvail: boolean | null;
  bizPrice: number | null;
}

/**
 * Scrapes the business-class search page for a target.
 *
 * @returns `bizAvail: false` when only a "no results" message appeared,
 *   `bizAvail: true` (with the lowest plausible price, if any) when results
 *   rendered, and `bizAvail: null` when neither could be determined.
 */
async function scrapeBusiness(page: Page, target: FlightTarget): Promise<BizData> {
  log.info(`scraping business class`, { id: target.id, url: target.bizUrl });

  try {
    await page.goto(target.bizUrl, { waitUntil: 'networkidle', timeout: 30_000 });
  } catch {
    try {
      await page.goto(target.bizUrl, { waitUntil: 'domcontentloaded', timeout: 30_000 });
    } catch (err) {
      log.warn(`biz navigation failed`, { id: target.id, err: String(err) });
      return { bizAvail: null, bizPrice: null };
    }
  }

  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-biz-loaded.png`, fullPage: false });
  }

  const [hasResults, hasNoResults] = await Promise.all([
    waitForResults(page, 10_000),
    waitForNoResults(page, 10_000),
  ]);

  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-biz-final.png`, fullPage: false });
  }

  if (hasNoResults && !hasResults) {
    log.info(`business class: no results`, { id: target.id });
    return { bizAvail: false, bizPrice: null };
  }

  if (hasResults) {
    const bizPrice = await extractLowestPrice(page, MAX_BUSINESS_PRICE, {
      id: target.id,
      cabin: 'business',
    });
    log.info(`business class available`, { id: target.id, bizPrice });
    return { bizAvail: true, bizPrice };
  }

  log.warn(`business class result uncertain`, { id: target.id });
  return { bizAvail: null, bizPrice: null };
}

// ── API calls ─────────────────────────────────────────────────────────────────

/** POSTs the target definitions to the API (logged only in dry-run mode). */
async function seedTargets(targets: FlightTarget[]): Promise<void> {
  if (DRY_RUN) {
    log.info('dry-run: would seed targets', { count: targets.length });
    return;
  }

  const res = await fetch(`${API_BASE}/api/flight-monitor/targets`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${SERVICE_TOKEN}`,
    },
    body: JSON.stringify({ targets }),
  });

  if (!res.ok) {
    const text = await res.text();
    log.warn(`targets seed failed`, { status: res.status, body: text });
  } else {
    const data = (await res.json()) as { seeded?: number };
    log.info(`targets seeded`, { seeded: data.seeded });
  }
}

/** POSTs scrape results to the API (logged only in dry-run mode). */
async function postSnapshots(results: ScrapeResult[]): Promise<void> {
  if (DRY_RUN) {
    log.info('dry-run: would POST snapshots', { count: results.length });
    for (const r of results) {
      log.info('snapshot', { id: r.target.id, ...r.snapshot });
    }
    return;
  }

  const res = await fetch(`${API_BASE}/api/flight-monitor/snapshots`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${SERVICE_TOKEN}`,
    },
    body: JSON.stringify({ results }),
  });

  if (!res.ok) {
    const text = await res.text();
    log.warn(`snapshot POST failed`, { status: res.status, body: text });
  } else {
    const data = (await res.json()) as { inserted?: number; diff?: unknown };
    // Tolerate a response without `diff` instead of throwing on `.length`.
    const diff: unknown[] = Array.isArray(data.diff) ? data.diff : [];
    log.info(`snapshots saved`, { inserted: data.inserted, changes: diff.length });
    if (diff.length > 0) {
      log.info(`price/availability changes detected`, { diff });
    }
  }
}

// ── Main ──────────────────────────────────────────────────────────────────────

const BROWSER_CONTEXT_OPTS = {
  userAgent:
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
  viewport: { width: 1440, height: 900 },
  locale: 'en-US',
  timezoneId: 'America/Los_Angeles',
} as const;

const TOR_PROXY = process.env.TOR_PROXY ?? 'socks5://127.0.0.1:9050';
const USE_TOR = process.argv.includes('--tor') || process.env.USE_TOR === '1';

/** Closes a page, logging (not throwing) if the browser is already gone. */
async function closePageSafely(page: Page, label: string): Promise<void> {
  try {
    await page.close();
  } catch (err) {
    log.debug(`${label} page.close threw (browser likely terminated by bot detection)`, { err: String(err) });
  }
}

/** Closes a browser, logging (not throwing) if the process is already gone. */
async function closeBrowserSafely(browser: Browser): Promise<void> {
  try {
    await browser.close();
  } catch (err) {
    log.debug('browser.close threw (process already gone)', { err: String(err) });
  }
}

/**
 * Entry point: selects targets, seeds them, scrapes economy and business pages
 * for each one sequentially, then posts the collected snapshots.
 */
async function main(): Promise<void> {
  let targets = ALL_TARGETS;

  if (FLIGHT_FLAG_INDEX >= 0 && !SINGLE) {
    // `--flight` with no value used to fall through and scrape everything.
    log.error(`--flight requires a flight id`, { available: ALL_TARGETS.map((t) => t.id) });
    process.exit(1);
  }

  if (SINGLE) {
    targets = targets.filter((t) => t.id === SINGLE);
    if (!targets.length) {
      log.error(`flight not found`, { id: SINGLE, available: ALL_TARGETS.map((t) => t.id) });
      process.exit(1);
    }
  }

  log.info(`starting flight scrape`, { flights: targets.length, dryRun: DRY_RUN, debug: DEBUG });

  if (!DRY_RUN && !SERVICE_TOKEN) {
    log.error('QUINN_MY_SERVICE_TOKEN env var is required (use --dry-run to skip posting)');
    process.exit(1);
  }

  await seedTargets(targets);

  const launchArgs = USE_TOR ? [`--proxy-server=${TOR_PROXY}`] : [];
  if (USE_TOR) log.info('routing through Tor', { proxy: TOR_PROXY });

  let browser: Browser = await chromium.launch({ headless: !DEBUG, args: launchArgs });
  let context: BrowserContext = await browser.newContext(BROWSER_CONTEXT_OPTS);

  /** Opens a new page, relaunching the browser if the current context is dead. */
  async function freshPage(): Promise<Page> {
    try {
      return await context.newPage();
    } catch (err) {
      log.warn('browser context dead, relaunching', { err: String(err) });
      await closeBrowserSafely(browser);
      // Relaunch with the same args so the proxy setting survives a restart.
      browser = await chromium.launch({ headless: !DEBUG, args: launchArgs });
      context = await browser.newContext(BROWSER_CONTEXT_OPTS);
      return await context.newPage();
    }
  }

  const results: ScrapeResult[] = [];

  try {
    for (const target of targets) {
      const scrapedAt = new Date().toISOString();

      let econ: EconomyData = emptyEconomy();
      try {
        const econPage = await freshPage();
        try {
          econ = await scrapeEconomy(econPage, target);
        } finally {
          await closePageSafely(econPage, 'economy');
        }
      } catch (err) {
        log.warn(`economy scrape threw`, { id: target.id, err: String(err) });
      }

      await delay(2000, 3500);

      let biz: BizData = { bizAvail: null, bizPrice: null };
      try {
        const bizPage = await freshPage();
        try {
          biz = await scrapeBusiness(bizPage, target);
        } finally {
          await closePageSafely(bizPage, 'business');
        }
      } catch (err) {
        log.warn(`biz scrape threw`, { id: target.id, err: String(err) });
      }

      results.push({
        target: { id: target.id, airline: target.airline, route: target.route },
        snapshot: {
          price: econ.price ?? undefined,
          bizAvail: biz.bizAvail,
          bizPrice: biz.bizPrice ?? undefined,
          scarcity: econ.scarcity ?? undefined,
          depTime: econ.depTime ?? undefined,
          arrTime: econ.arrTime ?? undefined,
          duration: econ.duration ?? undefined,
          scrapedAt,
        },
      });

      await delay(1500, 3000);
    }
  } finally {
    // Always release the browser, even if something unexpected escapes the loop.
    await closeBrowserSafely(browser);
  }

  const succeeded = results.filter((r) => r.snapshot.price != null).length;
  log.info(`scrape complete`, { succeeded, total: results.length });

  await postSnapshots(results);
}

main().catch((err) => {
  log.error('fatal', { err: String(err) });
  process.exit(1);
});