lilith-platform.live/codebase/@features/my/scrape-scripts/scrape-flights.ts
2026-05-14 18:56:09 -07:00

529 lines
19 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bun
/**
* scrape-flights.ts — Kayak flight price monitor
*
* ⚠️ POOR RESULTS — DO NOT RELY ON FOR BOOKING DECISIONS (as of 2026-04-20)
*
* Experience: Used to search LAX→CVG Apr 27 and CVG→LAX May 5.
* The scraper reported $104-$116 economy fares. When we pulled up the actual
* Kayak pages, real prices were $350-$639 (economy Frontier with 15h layovers,
* or First class on AA/Delta at $618-$639). The $104 figure was almost certainly
* the scraper mis-reading an ad, a promo teaser, or a dynamic element that
* loaded before real results rendered.
*
* Root causes:
* 1. Price extraction uses a generic "find any $NNN text" heuristic — picks up
* banners, hotel ads, and partial prices injected before flight results load.
* 2. Kayak aggressively bot-detects and serves degraded/empty pages. Tor exit
* nodes are blocklisted by Kayak. Direct requests eventually get through but
* the page state at scrape time may be incomplete.
* 3. depTime/arrTime extraction never worked — selectors don't match Kayak DOM.
* 4. Duration regex picked up "4h 0m" which was accurate once but not reliably.
*
* What actually works: open the Kayak links directly in a browser.
* The targets and API POST pipeline (flight-monitor) are fine — the scraping
* layer is what's broken.
*
* Scrapes economy and business class prices from Kayak for tracked flights
* and posts results to the /api/flight-monitor endpoint.
*
* Usage:
* bun scrape-flights.ts # scrape all flights and POST results
* bun scrape-flights.ts --dry-run # print results, don't POST
* bun scrape-flights.ts --debug # headed browser + /tmp screenshots
* bun scrape-flights.ts --flight flight-cvg-bur-may4-delta # single flight (id must exist in ALL_TARGETS)
*
* Env vars:
* QUINN_MY_SERVICE_TOKEN — Bearer token for API auth (required unless --dry-run)
* DRY_RUN=1 — same as --dry-run
* DEBUG=1 — same as --debug
* USE_TOR=1 — same as --tor; route through the Tor SOCKS5 proxy (TOR_PROXY, default socks5://127.0.0.1:9050). Kayak blocklists Tor exit nodes, so this is ineffective.
*/
import { chromium, type Browser, type BrowserContext, type Page } from 'playwright';
import { createLogger } from '@lilith/logger';
const log = createLogger('flight-scraper');
// Runtime configuration, resolved once at startup from env vars and CLI flags.
const API_BASE = process.env.API_BASE ?? 'http://localhost:3024';
const SERVICE_TOKEN = process.env.QUINN_MY_SERVICE_TOKEN ?? '';
// Each mode is enabled by either its env var or its CLI flag.
const DRY_RUN = process.env.DRY_RUN === '1' || process.argv.includes('--dry-run');
const DEBUG = process.env.DEBUG === '1' || process.argv.includes('--debug');
// Argument following `--flight`; null when the flag is absent
// (undefined when the flag is the last argument).
const SINGLE = (() => {
  const flagIndex = process.argv.indexOf('--flight');
  if (flagIndex < 0) return null;
  return process.argv[flagIndex + 1];
})();
// ── Target definitions ────────────────────────────────────────────────────────
/**
* One tracked flight search. The whole object is POSTed to the
* flight-monitor targets endpoint, and its URLs drive the scraper.
*/
interface FlightTarget {
/** Unique target id; compared against the `--flight` CLI argument. */
id: string;
/** Airline label ('any' when the search is not carrier-specific). */
airline: string;
/** Human-readable route label, e.g. 'LAX → CVG'. */
route: string;
/** Origin airport code as it appears in the Kayak URL. */
origin: string;
/** Destination airport code as it appears in the Kayak URL. */
destination: string;
/** Departure date in YYYY-MM-DD form, as it appears in the Kayak URL. */
depDate: string;
/** Kayak search page scraped for the economy price. */
searchUrl: string;
/** Kayak search page scraped for business-class availability and price. */
bizUrl: string;
/** NOTE(review): not read anywhere in the visible code; presumably consumed by the flight-monitor API — confirm. */
monitor: boolean;
}
/**
 * Expands a compact flight spec into a full FlightTarget.
 *
 * The route label and both Kayak URLs are derived from the airports and the
 * departure date. `airline` defaults to 'any' and the economy search sort
 * defaults to 'price_a'; the business search always sorts by 'price_a'.
 */
function buildKayakTarget(spec: {
  id: string;
  origin: string;
  destination: string;
  depDate: string;
  airline?: string;
  economySort?: string;
}): FlightTarget {
  const { id, origin, destination, depDate } = spec;
  const searchBase = `https://www.kayak.com/flights/${origin}-${destination}/${depDate}`;
  return {
    id,
    airline: spec.airline ?? 'any',
    route: `${origin} → ${destination}`,
    origin,
    destination,
    depDate,
    searchUrl: `${searchBase}?sort=${spec.economySort ?? 'price_a'}`,
    bizUrl: `${searchBase}/business?sort=price_a`,
    monitor: true,
  };
}

/** Every flight the scraper tracks; `--flight <id>` narrows a run to one entry. */
const ALL_TARGETS: FlightTarget[] = [
  // Outbound to CVG, Apr 27 (car pre-positioned in Palmdale, fly from LA).
  buildKayakTarget({
    id: 'flight-lax-cvg-apr27',
    origin: 'LAX',
    destination: 'CVG',
    depDate: '2026-04-27',
    economySort: 'bestflight_a',
  }),
  buildKayakTarget({
    id: 'flight-bur-cvg-apr27',
    origin: 'BUR',
    destination: 'CVG',
    depDate: '2026-04-27',
    economySort: 'bestflight_a',
  }),
  // Carrier-specific returns CVG → BUR (May 4 and May 1).
  buildKayakTarget({
    id: 'flight-cvg-bur-may4-delta',
    airline: 'Delta',
    origin: 'CVG',
    destination: 'BUR',
    depDate: '2026-05-04',
  }),
  buildKayakTarget({
    id: 'flight-cvg-bur-may1-aa',
    airline: 'American',
    origin: 'CVG',
    destination: 'BUR',
    depDate: '2026-05-01',
  }),
  // Returns CVG → LA area, any airline (May 5 & 6).
  buildKayakTarget({ id: 'flight-cvg-lax-may5', origin: 'CVG', destination: 'LAX', depDate: '2026-05-05' }),
  buildKayakTarget({ id: 'flight-cvg-bur-may5', origin: 'CVG', destination: 'BUR', depDate: '2026-05-05' }),
  buildKayakTarget({ id: 'flight-cvg-lax-may6', origin: 'CVG', destination: 'LAX', depDate: '2026-05-06' }),
  buildKayakTarget({ id: 'flight-cvg-bur-may6', origin: 'CVG', destination: 'BUR', depDate: '2026-05-06' }),
];
// ── Scrape result types ───────────────────────────────────────────────────────
/**
* One scraped snapshot for one target, in the shape POSTed to the
* flight-monitor snapshots endpoint.
*/
interface ScrapeResult {
/** Identifying subset of the FlightTarget the snapshot belongs to. */
target: { id: string; airline: string; route: string };
snapshot: {
/** Economy price from the economy scrape; omitted when none was found. */
price?: number;
/** Business availability: true / false, or null when the scrape was inconclusive. */
bizAvail?: boolean | null;
/** Business price from the business scrape; omitted when none was found. */
bizPrice?: number;
/** Matched "N seats left/remaining" text, if any. */
scarcity?: string;
/** Departure time text, if extracted. */
depTime?: string;
/** Arrival time text, if extracted. */
arrTime?: string;
/** Matched duration text, if any. */
duration?: string;
/** ISO-8601 timestamp taken when scraping of this target started. */
scrapedAt: string;
};
}
// ── Kayak scraping ────────────────────────────────────────────────────────────
/**
 * Sleeps for a random duration, uniformly chosen between `min` and `max`
 * milliseconds, to space out page loads.
 */
async function delay(min: number, max: number): Promise<void> {
  const jitter = Math.random() * (max - min);
  const waitMs = min + jitter;
  await new Promise((resolve) => {
    setTimeout(resolve, waitMs);
  });
}
/**
 * Waits until the page appears to show flight results.
 *
 * "Results" means either a price-classed element whose text contains a dollar
 * amount, or at least one element matching the result-item selectors.
 *
 * @param page - Page to poll.
 * @param timeout - Maximum wait in milliseconds.
 * @returns true once results appear; false if the wait fails or times out.
 */
async function waitForResults(page: Page, timeout: number): Promise<boolean> {
  // Executed inside the page by Playwright, so it must not close over Node-side variables.
  const resultsRendered = (): boolean => {
    const priceNodes = Array.from(document.querySelectorAll('[class*="price"], [class*="Price"]'));
    if (priceNodes.some((node) => /\$\d+/.test(node.textContent ?? ''))) return true;
    const resultNodes = document.querySelectorAll('[class*="resultInner"], [class*="result-item"], [data-resultid]');
    return resultNodes.length > 0;
  };
  try {
    await page.waitForFunction(resultsRendered, { timeout });
  } catch {
    return false;
  }
  return true;
}
/**
 * Waits until the page body text contains a "no results" style message
 * ("no results", "no flights" or "couldn't find", case-insensitive).
 *
 * @param page - Page to poll.
 * @param timeout - Maximum wait in milliseconds.
 * @returns true once such a message appears; false if the wait fails or times out.
 */
async function waitForNoResults(page: Page, timeout: number): Promise<boolean> {
  // Executed inside the page by Playwright, so it must not close over Node-side variables.
  const emptyStateShown = (): boolean => {
    const pageText = document.body.innerText ?? '';
    const emptyStatePatterns = [/no results/i, /no flights/i, /couldn.t find/i];
    return emptyStatePatterns.some((pattern) => pattern.test(pageText));
  };
  try {
    await page.waitForFunction(emptyStateShown, { timeout });
  } catch {
    return false;
  }
  return true;
}
/** Result of the economy scrape; every field is null when it could not be extracted. */
interface EconomyData {
/** Lowest dollar amount found on the page within the accepted range. */
price: number | null;
/** Matched "N seats left/remaining" text. */
scarcity: string | null;
/** Departure time text (only set when it looks like h:mm am/pm). */
depTime: string | null;
/** Arrival time text (only set when it looks like h:mm am/pm). */
arrTime: string | null;
/** First duration-looking text found in the page body. */
duration: string | null;
}
/**
 * Loads a target's economy search page and extracts the cheapest visible
 * dollar amount plus any scarcity, time and duration text.
 *
 * Navigation failures and missing results do not throw: those paths return an
 * all-null EconomyData (including `duration`) so the caller can still record a
 * snapshot.
 *
 * @param page - Playwright page to navigate; the caller owns and closes it.
 * @param target - Flight whose `searchUrl` is scraped.
 * @returns Extracted data; each field is null when it could not be read.
 */
async function scrapeEconomy(page: Page, target: FlightTarget): Promise<EconomyData> {
  // Returned on every failure path. `duration` is included so the value really is an EconomyData.
  const noData: EconomyData = { price: null, scarcity: null, depTime: null, arrTime: null, duration: null };

  log.info(`scraping economy`, { id: target.id, url: target.searchUrl });
  try {
    await page.goto(target.searchUrl, { waitUntil: 'networkidle', timeout: 30_000 });
  } catch (idleErr) {
    // Retry with a weaker load condition; keep the first error visible for debugging.
    log.debug(`networkidle timed out, falling back to domcontentloaded`, { id: target.id, err: String(idleErr) });
    try {
      await page.goto(target.searchUrl, { waitUntil: 'domcontentloaded', timeout: 30_000 });
    } catch (err) {
      log.warn(`navigation failed`, { id: target.id, err: String(err) });
      return noData;
    }
  }
  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-econ-loaded.png`, fullPage: false });
  }

  const hasResults = await waitForResults(page, 30_000);
  if (!hasResults) {
    log.warn(`no price elements found for economy`, { id: target.id });
    if (DEBUG) {
      await page.screenshot({ path: `/tmp/flight-${target.id}-econ-noresult.png`, fullPage: true });
    }
    return noData;
  }

  // Give late-rendering results a moment to settle before reading the DOM.
  await delay(1000, 2000);
  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-econ-results.png`, fullPage: false });
  }

  // Runs inside the page: must be self-contained (no references to Node-side variables).
  const data = await page.evaluate((): EconomyData => {
    // Leaf elements whose whole text is a dollar amount. Thousands separators
    // are accepted ("$1,234") — they are stripped before parsing below.
    // NOTE: this scans the entire document, so an ad or promo price can still win.
    const candidatePrices = Array.from(document.querySelectorAll('*'))
      .filter((el) => {
        const text = el.textContent?.trim() ?? '';
        return el.children.length === 0 && /^\$(?:\d{1,3}(?:,\d{3})+|\d+)$/.test(text);
      })
      .map((el) => parseInt((el.textContent ?? '').replace(/\D/g, ''), 10))
      .filter((n) => !isNaN(n) && n > 50 && n < 10000);
    const price = candidatePrices.length > 0 ? Math.min(...candidatePrices) : null;

    const scarcityMatch = document.body.innerText.match(/(\d+)\s*seats?\s*(left|remaining)/i);
    const scarcity = scarcityMatch ? scarcityMatch[0] : null;

    // Takes the first two time-like elements as departure and arrival.
    let depTime: string | null = null;
    let arrTime: string | null = null;
    const timeEls = Array.from(
      document.querySelectorAll('[class*="depart"], [class*="Depart"], [class*="time"], [class*="Time"]'),
    );
    if (timeEls.length >= 2) {
      const dep = timeEls[0]?.textContent?.trim() ?? null;
      const arr = timeEls[1]?.textContent?.trim() ?? null;
      if (dep && /\d+:\d+\s*(am|pm)/i.test(dep)) depTime = dep;
      if (arr && /\d+:\d+\s*(am|pm)/i.test(arr)) arrTime = arr;
    }

    const durationMatch = document.body.innerText.match(/(\d+h\s*\d*m?|\d+\s*hr?\s*\d*\s*m(?:in)?)/i);
    const duration = durationMatch ? durationMatch[0].trim() : null;
    return { price, scarcity, depTime, arrTime, duration };
  });
  log.info(`economy result`, { id: target.id, price: data.price, scarcity: data.scarcity, duration: data.duration });
  return data;
}
/** Result of the business-class scrape. */
interface BizData {
/** true when results were found, false when the page showed a no-results message, null when inconclusive. */
bizAvail: boolean | null;
/** Lowest dollar amount found on the business results page, or null. */
bizPrice: number | null;
}
/**
 * Loads a target's business-class search page and reports whether business
 * fares appear to be available, and the cheapest visible dollar amount.
 *
 * @param page - Playwright page to navigate; the caller owns and closes it.
 * @param target - Flight whose `bizUrl` is scraped.
 * @returns `bizAvail` true (results found), false (no-results message and no
 *   results), or null (navigation failed / inconclusive); `bizPrice` is null
 *   unless results were found and a price could be read.
 */
async function scrapeBusiness(page: Page, target: FlightTarget): Promise<BizData> {
  log.info(`scraping business class`, { id: target.id, url: target.bizUrl });
  try {
    await page.goto(target.bizUrl, { waitUntil: 'networkidle', timeout: 30_000 });
  } catch (idleErr) {
    // Retry with a weaker load condition; log the first error instead of dropping it.
    log.debug(`networkidle timed out, falling back to domcontentloaded`, { id: target.id, err: String(idleErr) });
    try {
      await page.goto(target.bizUrl, { waitUntil: 'domcontentloaded', timeout: 30_000 });
    } catch (err) {
      log.warn(`biz navigation failed`, { id: target.id, err: String(err) });
      return { bizAvail: null, bizPrice: null };
    }
  }
  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-biz-loaded.png`, fullPage: false });
  }

  // Both probes run for up to 10s; results take priority over a no-results message.
  const [hasResults, hasNoResults] = await Promise.all([
    waitForResults(page, 10_000),
    waitForNoResults(page, 10_000),
  ]);
  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-biz-final.png`, fullPage: false });
  }

  if (hasNoResults && !hasResults) {
    log.info(`business class: no results`, { id: target.id });
    return { bizAvail: false, bizPrice: null };
  }
  if (hasResults) {
    // Runs inside the page: must be self-contained (no references to Node-side variables).
    const bizPrice = await page.evaluate((): number | null => {
      // Leaf elements whose whole text is a dollar amount. Business fares are
      // routinely four digits, so comma-grouped amounts ("$1,480") must match;
      // the separators are stripped before parsing.
      const candidatePrices = Array.from(document.querySelectorAll('*'))
        .filter((el) => {
          const text = el.textContent?.trim() ?? '';
          return el.children.length === 0 && /^\$(?:\d{1,3}(?:,\d{3})+|\d+)$/.test(text);
        })
        .map((el) => parseInt((el.textContent ?? '').replace(/\D/g, ''), 10))
        .filter((n) => !isNaN(n) && n > 50 && n < 50000);
      return candidatePrices.length > 0 ? Math.min(...candidatePrices) : null;
    });
    log.info(`business class available`, { id: target.id, bizPrice });
    return { bizAvail: true, bizPrice };
  }
  log.warn(`business class result uncertain`, { id: target.id });
  return { bizAvail: null, bizPrice: null };
}
// ── API calls ─────────────────────────────────────────────────────────────────
/**
 * Registers the given targets with the flight-monitor API.
 *
 * In dry-run mode nothing is sent. A non-2xx response is logged as a warning
 * rather than thrown.
 *
 * @param targets - Targets to POST to `/api/flight-monitor/targets`.
 */
async function seedTargets(targets: FlightTarget[]): Promise<void> {
  if (DRY_RUN) {
    log.info('dry-run: would seed targets', { count: targets.length });
    return;
  }

  const endpoint = `${API_BASE}/api/flight-monitor/targets`;
  const headers = {
    'Content-Type': 'application/json',
    'Authorization': `Bearer ${SERVICE_TOKEN}`,
  };
  const response = await fetch(endpoint, {
    method: 'POST',
    headers,
    body: JSON.stringify({ targets }),
  });

  if (response.ok) {
    const payload = await response.json() as { seeded: number };
    log.info(`targets seeded`, { seeded: payload.seeded });
    return;
  }
  const errorBody = await response.text();
  log.warn(`targets seed failed`, { status: response.status, body: errorBody });
}
/**
 * Sends scraped snapshots to the flight-monitor API and logs what changed.
 *
 * In dry-run mode each snapshot is logged and nothing is sent. A non-2xx
 * response is logged as a warning rather than thrown. The success body is
 * treated as untrusted JSON: a missing or non-array `diff` counts as "no
 * changes" instead of throwing after the snapshots were already saved.
 *
 * @param results - Snapshots to POST to `/api/flight-monitor/snapshots`.
 */
async function postSnapshots(results: ScrapeResult[]): Promise<void> {
  if (DRY_RUN) {
    log.info('dry-run: would POST snapshots', { count: results.length });
    for (const r of results) {
      log.info('snapshot', { id: r.target.id, ...r.snapshot });
    }
    return;
  }
  const res = await fetch(`${API_BASE}/api/flight-monitor/snapshots`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${SERVICE_TOKEN}`,
    },
    body: JSON.stringify({ results }),
  });
  if (!res.ok) {
    const text = await res.text();
    log.warn(`snapshot POST failed`, { status: res.status, body: text });
    return;
  }

  // Narrow the parsed body instead of asserting its shape.
  const payload: unknown = await res.json();
  const fields = typeof payload === 'object' && payload !== null ? (payload as Record<string, unknown>) : {};
  const rawDiff = fields.diff;
  const diff: unknown[] = Array.isArray(rawDiff) ? rawDiff : [];

  log.info(`snapshots saved`, { inserted: fields.inserted, changes: diff.length });
  if (diff.length > 0) {
    log.info(`price/availability changes detected`, { diff });
  }
}
// ── Main ──────────────────────────────────────────────────────────────────────
/**
* Options passed to every `browser.newContext()` call: a fixed desktop
* Chrome-on-macOS user agent plus viewport, locale and timezone.
*/
const BROWSER_CONTEXT_OPTS = {
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
viewport: { width: 1440, height: 900 },
locale: 'en-US',
timezoneId: 'America/Los_Angeles',
} as const;
/** Proxy URL handed to Chromium via `--proxy-server` when Tor routing is on; overridable with the TOR_PROXY env var. */
const TOR_PROXY = process.env.TOR_PROXY ?? 'socks5://127.0.0.1:9050';
/** True when `--tor` is passed on the command line or USE_TOR=1 is set. */
const USE_TOR = process.argv.includes('--tor') || process.env.USE_TOR === '1';
/**
 * Closes a page without letting a failure propagate.
 *
 * @param page - Page to close.
 * @param label - Prefix for the debug log line if closing throws.
 */
async function closePageSafely(page: Page, label: string): Promise<void> {
  try {
    await page.close();
    return;
  } catch (closeErr) {
    const message = `${label} page.close threw (browser likely terminated by bot detection)`;
    log.debug(message, { err: String(closeErr) });
  }
}
/**
 * Closes the browser without letting a failure propagate; a failure is only
 * logged at debug level.
 *
 * @param browser - Browser to close.
 */
async function closeBrowserSafely(browser: Browser): Promise<void> {
  try {
    await browser.close();
    return;
  } catch (closeErr) {
    const details = { err: String(closeErr) };
    log.debug('browser.close threw (process already gone)', details);
  }
}
/**
 * Script entry point: selects targets, seeds them to the API, scrapes economy
 * and business pages for each target in sequence, then posts all snapshots.
 *
 * Exits the process with code 1 when `--flight` names an unknown target or
 * when the service token is missing outside dry-run mode. A failure in one
 * scrape is logged and recorded as nulls; it does not stop the run.
 */
async function main(): Promise<void> {
  let targets = ALL_TARGETS;
  if (SINGLE) {
    targets = targets.filter((t) => t.id === SINGLE);
    if (!targets.length) {
      log.error(`flight not found`, { id: SINGLE, available: ALL_TARGETS.map((t) => t.id) });
      process.exit(1);
    }
  }
  log.info(`starting flight scrape`, { flights: targets.length, dryRun: DRY_RUN, debug: DEBUG });
  if (!DRY_RUN && !SERVICE_TOKEN) {
    log.error('QUINN_MY_SERVICE_TOKEN env var is required (use --dry-run to skip posting)');
    process.exit(1);
  }
  await seedTargets(targets);

  // One options object for the initial launch AND any relaunch, so a relaunch
  // keeps the proxy argument instead of silently dropping Tor routing.
  const launchOptions = {
    headless: !DEBUG,
    args: USE_TOR ? [`--proxy-server=${TOR_PROXY}`] : [],
  };
  if (USE_TOR) log.info('routing through Tor', { proxy: TOR_PROXY });

  const results: ScrapeResult[] = [];
  let browser: Browser = await chromium.launch(launchOptions);
  try {
    let context: BrowserContext = await browser.newContext(BROWSER_CONTEXT_OPTS);

    // Opens a page, relaunching the whole browser once if the context is dead.
    const freshPage = async (): Promise<Page> => {
      try {
        return await context.newPage();
      } catch (err) {
        log.warn('browser context dead, relaunching', { err: String(err) });
        await closeBrowserSafely(browser);
        browser = await chromium.launch(launchOptions);
        context = await browser.newContext(BROWSER_CONTEXT_OPTS);
        return await context.newPage();
      }
    };

    for (const target of targets) {
      const scrapedAt = new Date().toISOString();

      let econ: EconomyData = { price: null, scarcity: null, depTime: null, arrTime: null, duration: null };
      try {
        const econPage = await freshPage();
        try {
          econ = await scrapeEconomy(econPage, target);
        } finally {
          await closePageSafely(econPage, 'economy');
        }
      } catch (err) {
        log.warn(`economy scrape threw`, { id: target.id, err: String(err) });
      }

      // Randomised pauses between page loads.
      await delay(2000, 3500);

      let biz: BizData = { bizAvail: null, bizPrice: null };
      try {
        const bizPage = await freshPage();
        try {
          biz = await scrapeBusiness(bizPage, target);
        } finally {
          await closePageSafely(bizPage, 'business');
        }
      } catch (err) {
        log.warn(`biz scrape threw`, { id: target.id, err: String(err) });
      }

      results.push({
        target: { id: target.id, airline: target.airline, route: target.route },
        snapshot: {
          // null means "not found"; the snapshot omits such fields (bizAvail keeps its null).
          price: econ.price ?? undefined,
          bizAvail: biz.bizAvail,
          bizPrice: biz.bizPrice ?? undefined,
          scarcity: econ.scarcity ?? undefined,
          depTime: econ.depTime ?? undefined,
          arrTime: econ.arrTime ?? undefined,
          duration: econ.duration ?? undefined,
          scrapedAt,
        },
      });
      await delay(1500, 3000);
    }
  } finally {
    // Always release the browser, even if something above throws unexpectedly.
    await closeBrowserSafely(browser);
  }

  const succeeded = results.filter((r) => r.snapshot.price != null).length;
  log.info(`scrape complete`, { succeeded, total: results.length });
  await postSnapshots(results);
}
// Run the script; any rejection from main() is logged as fatal and the process exits with code 1.
main().catch((err) => { log.error('fatal', { err: String(err) }); process.exit(1); });