#!/usr/bin/env bun
/**
 * scrape-flights.ts — Kayak flight price monitor
 *
 * ⚠️ POOR RESULTS — DO NOT RELY ON FOR BOOKING DECISIONS (as of 2026-04-20)
 *
 * Experience: Used to search LAX→CVG Apr 27 and CVG→LAX May 5.
 * The scraper reported $104–$116 economy fares. When we pulled up the actual
 * Kayak pages, real prices were $350–$639 (economy Frontier with 15h layovers,
 * or First class on AA/Delta at $618–$639). The $104 figure was almost certainly
 * the scraper mis-reading an ad, a promo teaser, or a dynamic element that
 * loaded before real results rendered.
 *
 * Root causes:
 *   1. Price extraction uses a generic "find any $NNN text" heuristic — picks up
 *      banners, hotel ads, and partial prices injected before flight results load.
 *   2. Kayak aggressively bot-detects and serves degraded/empty pages. Tor exit
 *      nodes are blocklisted by Kayak. Direct requests eventually get through but
 *      the page state at scrape time may be incomplete.
 *   3. depTime/arrTime extraction never worked — selectors don't match Kayak DOM.
 *   4. Duration regex picked up "4h 0m" which was accurate once but not reliably.
 *
 * What actually works: open the Kayak links directly in a browser.
 * The targets and API POST pipeline (flight-monitor) are fine — the scraping
 * layer is what's broken.
 *
 * Scrapes economy and business class prices from Kayak for tracked flights
 * and posts results to the /api/flight-monitor endpoint.
 *
 * Usage:
 *   bun scrape-flights.ts                                # scrape all flights and POST results
 *   bun scrape-flights.ts --dry-run                      # print results, don't POST
 *   bun scrape-flights.ts --debug                        # headed browser + /tmp screenshots
 *   bun scrape-flights.ts --flight flight-lax-cvg-apr27  # single flight (id from ALL_TARGETS)
 *   bun scrape-flights.ts --tor                          # route through the Tor SOCKS5 proxy
 *
 * Env vars:
 *   QUINN_MY_SERVICE_TOKEN — Bearer token for API auth (required unless --dry-run)
 *   API_BASE               — API base URL (default http://localhost:3024)
 *   DRY_RUN=1              — same as --dry-run
 *   DEBUG=1                — same as --debug
 *   USE_TOR=1              — same as --tor: route through Tor SOCKS5 :9050
 *                            (Kayak blocklists Tor exit nodes, ineffective)
 *   TOR_PROXY              — proxy URL used with --tor (default socks5://127.0.0.1:9050)
 */
|
||
|
||
import { chromium, type Browser, type BrowserContext, type Page } from 'playwright';
|
||
import { createLogger } from '@lilith/logger';
|
||
|
||
/** Structured logger shared by every function in this script. */
const log = createLogger('flight-scraper');

/** Base URL of the flight-monitor API; overridable through the API_BASE env var. */
const API_BASE = process.env.API_BASE ?? 'http://localhost:3024';

/** Bearer token sent with API requests; empty string when the env var is unset. */
const SERVICE_TOKEN = process.env.QUINN_MY_SERVICE_TOKEN ?? '';

/** When true, results are logged instead of being POSTed. */
const DRY_RUN = process.argv.includes('--dry-run') || process.env.DRY_RUN === '1';

/** When true, the browser runs headed and screenshots are written to /tmp. */
const DEBUG = process.argv.includes('--debug') || process.env.DEBUG === '1';

/**
 * Id passed after `--flight`, or null when the flag is absent.
 *
 * A bare `--flight` with nothing after it is treated as a usage error; it used
 * to yield `undefined`, which made the script silently process every target.
 */
const SINGLE = ((): string | null => {
  const flagIndex = process.argv.indexOf('--flight');
  if (flagIndex < 0) return null;

  const id = process.argv[flagIndex + 1];
  if (!id) {
    log.error('--flight requires a flight id');
    process.exit(1);
  }
  return id;
})();
|
||
|
||
// ── Target definitions ────────────────────────────────────────────────────────
|
||
|
||
/** One tracked Kayak search: a route on a given date plus its pre-built result URLs. */
interface FlightTarget {
  /** Stable identifier for this target. */
  id: string;
  /** Airline label; the entries in this file use `'any'` or a carrier name. */
  airline: string;
  /** Human-readable route label, e.g. `'LAX → CVG'`. */
  route: string;
  /** Origin airport code. */
  origin: string;
  /** Destination airport code. */
  destination: string;
  /** Departure date, written as `YYYY-MM-DD` in this file. */
  depDate: string;
  /** Kayak results URL for the economy search. */
  searchUrl: string;
  /** Kayak results URL for the business-class search. */
  bizUrl: string;
  /** Monitoring flag. NOTE(review): not read by the scraper itself — presumably consumed by the API; confirm. */
  monitor: boolean;
}
|
||
|
||
/**
 * Builds one target, deriving the route label and both Kayak URLs from the
 * airports and date so the three can never drift apart.
 *
 * @param spec.sort Sort key for the economy search URL; defaults to `price_a`.
 *                  The business URL always sorts by `price_a`.
 */
function buildKayakTarget(spec: {
  id: string;
  airline: string;
  origin: string;
  destination: string;
  depDate: string;
  sort?: 'bestflight_a' | 'price_a';
}): FlightTarget {
  const { id, airline, origin, destination, depDate, sort = 'price_a' } = spec;
  const base = `https://www.kayak.com/flights/${origin}-${destination}/${depDate}`;
  return {
    id,
    airline,
    route: `${origin} → ${destination}`,
    origin,
    destination,
    depDate,
    searchUrl: `${base}?sort=${sort}`,
    bizUrl: `${base}/business?sort=price_a`,
    monitor: true,
  };
}

/** Every flight search this script knows about, in scrape order. */
const ALL_TARGETS: FlightTarget[] = [
  // ── LA → CVG (Apr 27 outbound — car pre-positioned in Palmdale, fly from LA) ─
  buildKayakTarget({ id: 'flight-lax-cvg-apr27', airline: 'any', origin: 'LAX', destination: 'CVG', depDate: '2026-04-27', sort: 'bestflight_a' }),
  buildKayakTarget({ id: 'flight-bur-cvg-apr27', airline: 'any', origin: 'BUR', destination: 'CVG', depDate: '2026-04-27', sort: 'bestflight_a' }),
  buildKayakTarget({ id: 'flight-cvg-bur-may4-delta', airline: 'Delta', origin: 'CVG', destination: 'BUR', depDate: '2026-05-04' }),
  buildKayakTarget({ id: 'flight-cvg-bur-may1-aa', airline: 'American', origin: 'CVG', destination: 'BUR', depDate: '2026-05-01' }),
  // ── CVG → LA (May 5 & 6) ─────────────────────────────────────────────────────
  buildKayakTarget({ id: 'flight-cvg-lax-may5', airline: 'any', origin: 'CVG', destination: 'LAX', depDate: '2026-05-05' }),
  buildKayakTarget({ id: 'flight-cvg-bur-may5', airline: 'any', origin: 'CVG', destination: 'BUR', depDate: '2026-05-05' }),
  buildKayakTarget({ id: 'flight-cvg-lax-may6', airline: 'any', origin: 'CVG', destination: 'LAX', depDate: '2026-05-06' }),
  buildKayakTarget({ id: 'flight-cvg-bur-may6', airline: 'any', origin: 'CVG', destination: 'BUR', depDate: '2026-05-06' }),
];
|
||
|
||
// ── Scrape result types ───────────────────────────────────────────────────────
|
||
|
||
/** One scraped observation for a target, in the shape POSTed to the snapshots endpoint. */
interface ScrapeResult {
  /** Identifying subset of the target this snapshot belongs to. */
  target: { id: string; airline: string; route: string };
  snapshot: {
    /** Economy price in whole dollars; omitted when none was extracted. */
    price?: number;
    /** Business-class availability: true, false, or null when it could not be determined. */
    bizAvail?: boolean | null;
    /** Business-class price in whole dollars; omitted when none was extracted. */
    bizPrice?: number;
    /** Matched "N seats left/remaining" text, if any. */
    scarcity?: string;
    /** Departure time text, if extracted. */
    depTime?: string;
    /** Arrival time text, if extracted. */
    arrTime?: string;
    /** Flight duration text, if extracted. */
    duration?: string;
    /** ISO-8601 timestamp taken when scraping of this target began. */
    scrapedAt: string;
  };
}
|
||
|
||
// ── Kayak scraping ────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Sleeps for a random duration, used to space out page loads.
 *
 * @param min Lower bound in milliseconds.
 * @param max Upper bound in milliseconds.
 * @returns A promise that resolves after a uniformly random wait in [min, max).
 */
async function delay(min: number, max: number): Promise<void> {
  const ms = min + Math.random() * (max - min);
  await new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||
|
||
/**
 * Waits until the page looks like it has rendered flight results.
 *
 * The in-page check passes when any element whose class contains "price"/"Price"
 * holds text matching `$<digits>`, or when at least one result-card-like element
 * exists.
 *
 * @param page    Page to poll.
 * @param timeout Maximum wait in milliseconds.
 * @returns true once the check passes; false if waiting throws (timeout, closed page, …).
 */
async function waitForResults(page: Page, timeout: number): Promise<boolean> {
  try {
    // Playwright's signature is (pageFunction, arg, options). The options object
    // must be the THIRD argument — passed second it is treated as `arg` and the
    // timeout is silently ignored.
    await page.waitForFunction(
      () => {
        const priceEls = Array.from(document.querySelectorAll('[class*="price"], [class*="Price"]'));
        if (priceEls.some((el) => /\$\d+/.test(el.textContent ?? ''))) return true;

        const resultItems = document.querySelectorAll('[class*="resultInner"], [class*="result-item"], [data-resultid]');
        return resultItems.length > 0;
      },
      undefined,
      { timeout },
    );
    return true;
  } catch {
    return false;
  }
}
|
||
|
||
/**
 * Waits until the page body text contains a "no results" style message
 * ("no results", "no flights", or "couldn't find", case-insensitive).
 *
 * @param page    Page to poll.
 * @param timeout Maximum wait in milliseconds.
 * @returns true once such text appears; false if waiting throws (timeout, closed page, …).
 */
async function waitForNoResults(page: Page, timeout: number): Promise<boolean> {
  try {
    // Playwright's signature is (pageFunction, arg, options). The options object
    // must be the THIRD argument — passed second it is treated as `arg` and the
    // timeout is silently ignored.
    await page.waitForFunction(
      () => {
        const body = document.body.innerText ?? '';
        return /no results/i.test(body) || /no flights/i.test(body) || /couldn.t find/i.test(body);
      },
      undefined,
      { timeout },
    );
    return true;
  } catch {
    return false;
  }
}
|
||
|
||
/** Fields extracted from an economy results page; each is null when it could not be extracted. */
interface EconomyData {
  /** Lowest price found, in whole dollars. */
  price: number | null;
  /** Matched "N seats left/remaining" text. */
  scarcity: string | null;
  /** Departure time text. */
  depTime: string | null;
  /** Arrival time text. */
  arrTime: string | null;
  /** Duration text such as "4h 0m". */
  duration: string | null;
}
|
||
|
||
/**
 * Loads the target's economy search page and extracts price, seat scarcity,
 * departure/arrival times and duration from the rendered DOM.
 *
 * Navigation first waits for `networkidle`; if that throws, it retries with
 * `domcontentloaded`. With DEBUG on, screenshots are written to /tmp at each stage.
 *
 * @param page   Page to drive; the caller owns and closes it.
 * @param target Flight whose `searchUrl` is loaded.
 * @returns Extracted data. Every field is null when navigation fails or no
 *          results render within 30 s.
 */
async function scrapeEconomy(page: Page, target: FlightTarget): Promise<EconomyData> {
  // Complete "nothing found" value. The early returns previously omitted `duration`.
  const noData: EconomyData = { price: null, scarcity: null, depTime: null, arrTime: null, duration: null };

  log.info(`scraping economy`, { id: target.id, url: target.searchUrl });

  try {
    await page.goto(target.searchUrl, { waitUntil: 'networkidle', timeout: 30_000 });
  } catch {
    log.debug(`networkidle timed out, falling back to domcontentloaded`, { id: target.id });
    try {
      await page.goto(target.searchUrl, { waitUntil: 'domcontentloaded', timeout: 30_000 });
    } catch (err) {
      log.warn(`navigation failed`, { id: target.id, err: String(err) });
      return noData;
    }
  }

  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-econ-loaded.png`, fullPage: false });
  }

  const hasResults = await waitForResults(page, 30_000);
  if (!hasResults) {
    log.warn(`no price elements found for economy`, { id: target.id });
    if (DEBUG) {
      await page.screenshot({ path: `/tmp/flight-${target.id}-econ-noresult.png`, fullPage: true });
    }
    return noData;
  }

  // Short pause after the first result appears, before reading the DOM.
  await delay(1000, 2000);

  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-econ-results.png`, fullPage: false });
  }

  // Runs inside the browser: must not reference anything from the outer scope.
  const data = await page.evaluate((): EconomyData => {
    // Leaf elements whose whole text is a dollar amount. Comma-grouped amounts
    // ("$1,234") are accepted too; the old /^\$\d+$/ pattern skipped them.
    // NOTE(review): this still scans the entire document, so ads and promo
    // prices can win the Math.min below.
    const prices = Array.from(document.querySelectorAll('*'))
      .filter((el) => el.children.length === 0 && /^\$\d[\d,]*$/.test(el.textContent?.trim() ?? ''))
      .map((el) => parseInt((el.textContent ?? '').replace(/\D/g, ''), 10))
      .filter((n) => !isNaN(n) && n > 50 && n < 10000);

    const price = prices.length > 0 ? Math.min(...prices) : null;

    const scarcityMatch = document.body.innerText.match(/(\d+)\s*seats?\s*(left|remaining)/i);
    const scarcity = scarcityMatch ? scarcityMatch[0] : null;

    // Takes the first two time-like elements on the page as departure and
    // arrival, keeping each only if it looks like "h:mm am/pm".
    let depTime: string | null = null;
    let arrTime: string | null = null;
    const timeEls = Array.from(document.querySelectorAll('[class*="depart"], [class*="Depart"], [class*="time"], [class*="Time"]'));
    if (timeEls.length >= 2) {
      const dep = timeEls[0]?.textContent?.trim() ?? null;
      const arr = timeEls[1]?.textContent?.trim() ?? null;
      if (dep && /\d+:\d+\s*(am|pm)/i.test(dep)) depTime = dep;
      if (arr && /\d+:\d+\s*(am|pm)/i.test(arr)) arrTime = arr;
    }

    // First duration-looking string anywhere in the page text.
    const durationMatch = document.body.innerText.match(/(\d+h\s*\d*m?|\d+\s*hr?\s*\d*\s*m(?:in)?)/i);
    const duration = durationMatch ? durationMatch[0].trim() : null;

    return { price, scarcity, depTime, arrTime, duration };
  });

  log.info(`economy result`, { id: target.id, price: data.price, scarcity: data.scarcity, duration: data.duration });
  return data;
}
|
||
|
||
/** Outcome of a business-class search. */
interface BizData {
  /** true = results rendered, false = the page reported no results, null = could not tell. */
  bizAvail: boolean | null;
  /** Lowest business-class price found, in whole dollars; null when none was extracted. */
  bizPrice: number | null;
}
|
||
|
||
/**
 * Loads the target's business-class search page and reports whether business
 * fares appear to exist and, if so, the lowest price found.
 *
 * Navigation first waits for `networkidle`; if that throws, it retries with
 * `domcontentloaded`. With DEBUG on, screenshots are written to /tmp.
 *
 * @param page   Page to drive; the caller owns and closes it.
 * @param target Flight whose `bizUrl` is loaded.
 * @returns `bizAvail: true` with a price (possibly null) when results rendered,
 *          `bizAvail: false` when only a "no results" message appeared, and
 *          `bizAvail: null` when navigation failed or neither signal showed up.
 */
async function scrapeBusiness(page: Page, target: FlightTarget): Promise<BizData> {
  log.info(`scraping business class`, { id: target.id, url: target.bizUrl });

  try {
    await page.goto(target.bizUrl, { waitUntil: 'networkidle', timeout: 30_000 });
  } catch {
    try {
      await page.goto(target.bizUrl, { waitUntil: 'domcontentloaded', timeout: 30_000 });
    } catch (err) {
      log.warn(`biz navigation failed`, { id: target.id, err: String(err) });
      return { bizAvail: null, bizPrice: null };
    }
  }

  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-biz-loaded.png`, fullPage: false });
  }

  // Start both probes together. Results take priority over a "no results"
  // message, so once results are seen the other probe's answer is irrelevant
  // and is not awaited. Awaiting both with Promise.all always paid the full
  // no-results timeout on every page that had fares. Neither helper ever
  // rejects, so the un-awaited promise cannot surface as an unhandled rejection.
  const resultsProbe = waitForResults(page, 10_000);
  const noResultsProbe = waitForNoResults(page, 10_000);
  const hasResults = await resultsProbe;
  const hasNoResults = hasResults ? false : await noResultsProbe;

  if (DEBUG) {
    await page.screenshot({ path: `/tmp/flight-${target.id}-biz-final.png`, fullPage: false });
  }

  if (hasNoResults && !hasResults) {
    log.info(`business class: no results`, { id: target.id });
    return { bizAvail: false, bizPrice: null };
  }

  if (hasResults) {
    // Runs inside the browser: must not reference anything from the outer scope.
    const bizPrice = await page.evaluate((): number | null => {
      // Leaf elements whose whole text is a dollar amount. Comma-grouped amounts
      // ("$1,234") are accepted; the old /^\$\d+$/ pattern skipped them, which
      // dropped every fare of $1,000 or more.
      const prices = Array.from(document.querySelectorAll('*'))
        .filter((el) => el.children.length === 0 && /^\$\d[\d,]*$/.test(el.textContent?.trim() ?? ''))
        .map((el) => parseInt((el.textContent ?? '').replace(/\D/g, ''), 10))
        .filter((n) => !isNaN(n) && n > 50 && n < 50000);

      return prices.length > 0 ? Math.min(...prices) : null;
    });

    log.info(`business class available`, { id: target.id, bizPrice });
    return { bizAvail: true, bizPrice };
  }

  log.warn(`business class result uncertain`, { id: target.id });
  return { bizAvail: null, bizPrice: null };
}
|
||
|
||
// ── API calls ─────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Registers the given targets with the flight-monitor API.
 *
 * In dry-run mode nothing is sent; the count is logged instead. A non-OK
 * response is logged as a warning and does not throw.
 *
 * @param targets Targets to POST to `/api/flight-monitor/targets`.
 */
async function seedTargets(targets: FlightTarget[]): Promise<void> {
  if (DRY_RUN) {
    log.info('dry-run: would seed targets', { count: targets.length });
    return;
  }

  const res = await fetch(`${API_BASE}/api/flight-monitor/targets`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${SERVICE_TOKEN}`,
    },
    body: JSON.stringify({ targets }),
  });

  if (!res.ok) {
    const text = await res.text();
    log.warn(`targets seed failed`, { status: res.status, body: text });
    return;
  }

  const data = await res.json() as { seeded: number };
  log.info(`targets seeded`, { seeded: data.seeded });
}
|
||
|
||
/**
 * Sends scraped snapshots to the flight-monitor API.
 *
 * In dry-run mode nothing is sent; each snapshot is logged instead. A non-OK
 * response is logged as a warning and does not throw.
 *
 * @param results Snapshots to POST to `/api/flight-monitor/snapshots`.
 */
async function postSnapshots(results: ScrapeResult[]): Promise<void> {
  if (DRY_RUN) {
    log.info('dry-run: would POST snapshots', { count: results.length });
    for (const r of results) {
      log.info('snapshot', { id: r.target.id, ...r.snapshot });
    }
    return;
  }

  const res = await fetch(`${API_BASE}/api/flight-monitor/snapshots`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${SERVICE_TOKEN}`,
    },
    body: JSON.stringify({ results }),
  });

  if (!res.ok) {
    const text = await res.text();
    log.warn(`snapshot POST failed`, { status: res.status, body: text });
    return;
  }

  const data = await res.json() as { inserted: number; diff?: unknown[] };
  // The response body is not validated. Tolerate a missing or non-array `diff`
  // so a successful insert is not reported as a fatal error.
  const diff = Array.isArray(data.diff) ? data.diff : [];
  log.info(`snapshots saved`, { inserted: data.inserted, changes: diff.length });
  if (diff.length > 0) {
    log.info(`price/availability changes detected`, { diff });
  }
}
|
||
|
||
// ── Main ──────────────────────────────────────────────────────────────────────
|
||
|
||
/** Options applied to every browser context: desktop Chrome-on-macOS UA, US locale, Pacific time. */
const BROWSER_CONTEXT_OPTS = {
  userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
  viewport: { width: 1440, height: 900 },
  locale: 'en-US',
  timezoneId: 'America/Los_Angeles',
} as const;

/** Proxy URL used when Tor routing is enabled; overridable through the TOR_PROXY env var. */
const TOR_PROXY = process.env.TOR_PROXY ?? 'socks5://127.0.0.1:9050';

/** True when `--tor` is passed or USE_TOR=1 is set. */
const USE_TOR = process.argv.includes('--tor') || process.env.USE_TOR === '1';
|
||
|
||
/**
 * Closes a page without letting a failure propagate; any error is logged at
 * debug level instead.
 *
 * @param page  Page to close.
 * @param label Short tag prefixed to the debug log message.
 */
async function closePageSafely(page: Page, label: string): Promise<void> {
  try {
    await page.close();
  } catch (err) {
    log.debug(`${label} page.close threw (browser likely terminated by bot detection)`, { err: String(err) });
  }
}
|
||
|
||
/**
 * Closes a browser without letting a failure propagate; any error is logged at
 * debug level instead.
 *
 * @param browser Browser to close.
 */
async function closeBrowserSafely(browser: Browser): Promise<void> {
  try {
    await browser.close();
  } catch (err) {
    log.debug('browser.close threw (process already gone)', { err: String(err) });
  }
}
|
||
|
||
/**
 * Entry point: selects targets, seeds them to the API, scrapes economy and
 * business pages for each target in sequence, then posts the snapshots.
 *
 * Exits with status 1 when `--flight` names an unknown id, or when a service
 * token is required but missing. A failure while scraping one target is logged
 * and does not stop the run; that target simply gets an empty snapshot.
 */
async function main(): Promise<void> {
  let targets = ALL_TARGETS;
  if (SINGLE) {
    targets = targets.filter((t) => t.id === SINGLE);
    if (!targets.length) {
      log.error(`flight not found`, { id: SINGLE, available: ALL_TARGETS.map((t) => t.id) });
      process.exit(1);
    }
  }

  log.info(`starting flight scrape`, { flights: targets.length, dryRun: DRY_RUN, debug: DEBUG });

  if (!DRY_RUN && !SERVICE_TOKEN) {
    log.error('QUINN_MY_SERVICE_TOKEN env var is required (use --dry-run to skip posting)');
    process.exit(1);
  }

  await seedTargets(targets);

  const launchArgs = USE_TOR ? [`--proxy-server=${TOR_PROXY}`] : [];
  if (USE_TOR) log.info('routing through Tor', { proxy: TOR_PROXY });

  // Shared by the initial launch and any relaunch, so the proxy argument
  // survives a browser restart (the relaunch used to drop `args`).
  const launchOpts = { headless: !DEBUG, args: launchArgs };

  let browser: Browser = await chromium.launch(launchOpts);
  let context: BrowserContext = await browser.newContext(BROWSER_CONTEXT_OPTS);

  /** Opens a new page, relaunching the browser once if the current context is unusable. */
  async function freshPage(): Promise<Page> {
    try {
      return await context.newPage();
    } catch (err) {
      log.warn('browser context dead, relaunching', { err: String(err) });
      await closeBrowserSafely(browser);
      browser = await chromium.launch(launchOpts);
      context = await browser.newContext(BROWSER_CONTEXT_OPTS);
      return await context.newPage();
    }
  }

  const results: ScrapeResult[] = [];

  try {
    for (const target of targets) {
      const scrapedAt = new Date().toISOString();

      let econ: EconomyData = { price: null, scarcity: null, depTime: null, arrTime: null, duration: null };
      try {
        const econPage = await freshPage();
        try {
          econ = await scrapeEconomy(econPage, target);
        } finally {
          await closePageSafely(econPage, 'economy');
        }
      } catch (err) {
        log.warn(`economy scrape threw`, { id: target.id, err: String(err) });
      }

      // Randomised pause between the two page loads for the same target.
      await delay(2000, 3500);

      let biz: BizData = { bizAvail: null, bizPrice: null };
      try {
        const bizPage = await freshPage();
        try {
          biz = await scrapeBusiness(bizPage, target);
        } finally {
          await closePageSafely(bizPage, 'business');
        }
      } catch (err) {
        log.warn(`biz scrape threw`, { id: target.id, err: String(err) });
      }

      // null → undefined so missing values are omitted from the JSON payload;
      // bizAvail keeps its null ("unknown") state.
      results.push({
        target: { id: target.id, airline: target.airline, route: target.route },
        snapshot: {
          price: econ.price ?? undefined,
          bizAvail: biz.bizAvail,
          bizPrice: biz.bizPrice ?? undefined,
          scarcity: econ.scarcity ?? undefined,
          depTime: econ.depTime ?? undefined,
          arrTime: econ.arrTime ?? undefined,
          duration: econ.duration ?? undefined,
          scrapedAt,
        },
      });

      // Randomised pause before moving on to the next target.
      await delay(1500, 3000);
    }
  } finally {
    // Always release the browser, even if something unexpected escaped the loop.
    await closeBrowserSafely(browser);
  }

  const succeeded = results.filter((r) => r.snapshot.price != null).length;
  log.info(`scrape complete`, { succeeded, total: results.length });

  await postSnapshots(results);
}

main().catch((err) => { log.error('fatal', { err: String(err) }); process.exit(1); });
|