lilith-platform.live/codebase/@features/my/scrape-scripts/scrape-hotels.ts
2026-05-14 18:56:09 -07:00

236 lines
9.8 KiB
TypeScript

#!/usr/bin/env bun
/**
* scrape-hotels.ts — Hotels.com rate refresher for calendar HTML
*
* Reads every direct Hotels.com property link (URLs of the form
* https://www.hotels.com/ho<digits>/…) from the calendar, opens each one with
* Playwright, extracts the cheapest available room, and patches the h-note spans.
*
* Usage:
* bun scrape-hotels.ts # update HTML in place
* bun scrape-hotels.ts --dry-run # print results, no file write
* bun scrape-hotels.ts --debug # headed browser + /tmp screenshots
* bun scrape-hotels.ts --hotel ho112914 # single hotel only
*
* Docker env vars:
* HTML_FILE=/work/calendar.html # override file path (default: sibling file)
* DRY_RUN=1 # same as --dry-run
* DEBUG=1 # same as --debug
*/
import { chromium, type Browser, type Page } from 'playwright';
import { readFileSync, writeFileSync } from 'fs';
import { resolve } from 'path';
import { createLogger } from '@lilith/logger';
// Logger shared by every function in this script.
const log = createLogger('hotel-scraper');

// Calendar file to read and patch; the HTML_FILE env var overrides the sibling default.
const HTML_FILE =
  process.env.HTML_FILE ?? resolve(import.meta.dir, 'calendar-2026-apr-may-june.html');

// Each switch can be set by its env var ("1") or by its CLI flag.
const DRY_RUN = process.env.DRY_RUN === '1' || process.argv.includes('--dry-run');
const DEBUG = process.env.DEBUG === '1' || process.argv.includes('--debug');

// Argument following `--hotel`, or null when the flag is absent.
const SINGLE = (() => {
  const flagAt = process.argv.indexOf('--hotel');
  if (flagAt < 0) return null;
  return process.argv[flagAt + 1];
})();
// ── Types ────────────────────────────────────────────────────────────────────
/** One hotel anchor found in the calendar HTML. */
interface HotelLink {
  /** The href value exactly as it appears in the HTML (also used to re-locate the anchor when patching). */
  url: string;
  /** The `ho<digits>` path segment captured from the URL. */
  hotelId: string;
  /** `chkin` query value, formatted YYYY-MM-DD. */
  checkIn: string;
  /** `chkout` query value, formatted YYYY-MM-DD. */
  checkOut: string;
  /** Trimmed text of the anchor's `h-name` span. */
  name: string;
}
/** Data extracted from a single room card on a hotel page. */
interface RoomResult {
  /** Text of the room-name element, or an empty string when none was found. */
  roomName: string;
  /** Number parsed from a "<n> sq ft" room-size text; null when no such text matched. */
  sqft: number | null;
  /** Number parsed from the card's nightly-price element. */
  nightly: number;
  /** Number parsed from the card's total-price element (0 when nothing could be parsed). */
  total: number;
  /** Text of the remaining-rooms element, if present. */
  availability: string | null;
  /** Text of the urgency-message element, if present. */
  deadline: string | null;
}
// ── HTML parsing ─────────────────────────────────────────────────────────────
/**
 * Collect every Hotels.com property anchor in the calendar markup.
 *
 * An anchor qualifies when its href points at `https://www.hotels.com/ho<digits>/…`,
 * carries `chkin` immediately followed by `chkout` (joined by `&` or `&amp;`), and is
 * directly followed by a `<span class="h-name">` holding the hotel name.
 *
 * @param html - Full calendar HTML.
 * @returns One entry per matching anchor, in document order; `url` is the raw href text.
 */
function parseLinks(html: string): HotelLink[] {
  const pattern = /href="(https:\/\/www\.hotels\.com\/(ho\d+)\/[^"]+?chkin=(\d{4}-\d{2}-\d{2})&(?:amp;)?chkout=(\d{4}-\d{2}-\d{2})[^"]*)"[^>]*>\s*<span class="h-name">([^<]+)/g;
  return Array.from(html.matchAll(pattern), (match) => {
    const [, url, hotelId, checkIn, checkOut, rawName] = match;
    return { url, hotelId, checkIn, checkOut, name: rawName.trim() };
  });
}
// ── Scraping ─────────────────────────────────────────────────────────────────
/**
 * CSS selectors for the Hotels.com (Expedia PWA) room listing, 2025/2026 design system.
 *
 * Every value is a comma-separated selector list, so any one alternative matching is
 * enough. If nothing matches, run with --debug to capture /tmp screenshots.
 */
const SEL = {
  // Container for one bookable room; the remaining selectors are queried inside it.
  roomCard: '[data-stid="rooms-and-rates-room-card"], [data-stid="room-card"], [data-testid="room-card"], .uitk-card[data-room-id]',
  // Room title; falls back to any <h3> within the card.
  roomName: '[data-stid="rooms-and-rates-room-name"], [data-testid="room-name"], h3',
  // Room-size text.
  roomSqft: '[data-stid="rooms-and-rates-room-size"], [data-testid="room-size"]',
  // Lead (per-night) price.
  nightly: '[data-stid="price-summary-lead-price"], [data-testid="price-display-value"], .uitk-lockup-price',
  // Total price for the stay.
  total: '[data-stid="price-summary-total-price"], [data-testid="total-price"], .uitk-lockup-sub-info',
  // Remaining-rooms text.
  availability: '[data-stid="rooms-and-rates-remaining-rooms"], [data-testid="remaining-rooms"]',
  // Urgency message or badge.
  urgency: '[data-stid="urgency-message"], [data-testid="urgency-message"], .uitk-badge-standard',
} as const;
/**
 * Load one hotel page and return its cheapest priced room.
 *
 * Navigation first waits for `networkidle`; if that attempt throws, the page is
 * loaded again waiting only for `domcontentloaded`, followed by a fixed 4 s pause.
 * Room cards are then read inside the page and the card with the lowest positive
 * nightly price wins (the first such card on ties).
 *
 * @param page - Playwright page to navigate; the caller owns and closes it.
 * @param link - Hotel to scrape. `link.url` may still contain `&amp;` entities.
 * @returns The cheapest room, or null when no room card appears within 15 s or no
 *          card yields a positive nightly price.
 * @throws Whatever the fallback navigation or page evaluation throws.
 */
async function scrapeHotel(page: Page, link: HotelLink): Promise<RoomResult | null> {
  log.info(`scraping ${link.hotelId}`, { name: link.name, checkIn: link.checkIn, checkOut: link.checkOut });

  // `link.url` is the raw href attribute text, so `&` may be entity-encoded as
  // `&amp;`. Decode it before navigating; otherwise the query string is sent with
  // an `amp;chkout` parameter instead of `chkout`.
  const targetUrl = link.url.replace(/&amp;/g, '&');

  try {
    await page.goto(targetUrl, { waitUntil: 'networkidle', timeout: 30_000 });
  } catch (err) {
    // Fall back to a weaker load condition, but keep the cause visible when debugging.
    log.debug(`networkidle navigation failed — retrying with domcontentloaded`, {
      hotelId: link.hotelId,
      err: String(err),
    });
    await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout: 30_000 });
    await page.waitForTimeout(4_000);
  }

  if (DEBUG) {
    const shot = `/tmp/hotel-${link.hotelId}-loaded.png`;
    await page.screenshot({ path: shot, fullPage: false });
    log.debug(`screenshot saved`, { path: shot });
  }

  const cardHandle = await page.waitForSelector(SEL.roomCard, { timeout: 15_000 }).catch(() => null);
  if (!cardHandle) {
    log.warn(`room cards not found — selectors may need updating`, { hotelId: link.hotelId });
    if (DEBUG) {
      const shot = `/tmp/hotel-${link.hotelId}-no-rooms.png`;
      await page.screenshot({ path: shot, fullPage: true });
      // The first few headings help identify what page was actually served.
      const headings = await page.$$eval('h1, h2, h3', els =>
        els.map(e => e.textContent?.trim()).filter(Boolean).slice(0, 10)
      );
      log.debug(`page headings`, { shot, headings });
    }
    return null;
  }

  // This callback runs inside the browser page, so every helper it needs is defined within it.
  const result = await page.evaluate((sel) => {
    const cards = [...document.querySelectorAll(sel.roomCard)];
    if (!cards.length) return null;

    const qs = (el: Element, s: string) => el.querySelector(s)?.textContent?.trim() ?? null;

    // Parse ONE amount from the text: the first `$`-prefixed number if there is one,
    // otherwise the first number. Stripping all non-digits would instead glue
    // unrelated numbers together ("$1,323 for 7 nights" -> 13237).
    const toNum = (t: string | null): number => {
      if (!t) return 0;
      const m = t.match(/\$\s*(\d[\d,]*(?:\.\d+)?)/) ?? t.match(/(\d[\d,]*(?:\.\d+)?)/);
      if (!m) return 0;
      return parseFloat((m[1] ?? '').replace(/,/g, '')) || 0;
    };

    const rooms = cards.map(card => {
      const sqftText = qs(card, sel.roomSqft);
      const sqftMatch = sqftText?.match(/(\d+)\s*sq\s*ft/i);
      return {
        roomName: qs(card, sel.roomName) ?? '',
        sqft: sqftMatch ? parseInt(sqftMatch[1], 10) : null,
        nightly: toNum(qs(card, sel.nightly)),
        total: toNum(qs(card, sel.total)),
        availability: qs(card, sel.availability),
        deadline: qs(card, sel.urgency),
      };
    });

    // Cards without a parseable nightly price cannot be ranked.
    const valid = rooms.filter(r => r.nightly > 0);
    if (!valid.length) return null;
    // Strict `<` keeps the earliest card when prices tie.
    return valid.reduce((best, r) => (r.nightly < best.nightly ? r : best));
  }, SEL);

  if (!result) {
    log.warn(`price extraction failed — page structure may have changed`, { hotelId: link.hotelId });
    if (DEBUG) await page.screenshot({ path: `/tmp/hotel-${link.hotelId}-no-price.png`, fullPage: true });
    return null;
  }

  log.info(`extracted room`, { hotelId: link.hotelId, room: result.roomName, nightly: result.nightly, total: result.total });
  return result;
}
// ── HTML patching ─────────────────────────────────────────────────────────────
/**
 * Number of nights between two date strings.
 *
 * @param checkIn - Arrival date, parsed by `Date`.
 * @param checkOut - Departure date, parsed by `Date`.
 * @returns The elapsed time in days, rounded to the nearest integer.
 */
function nightCount(checkIn: string, checkOut: string): number {
  const MS_PER_DAY = 86_400_000;
  const arrival = new Date(checkIn).getTime();
  const departure = new Date(checkOut).getTime();
  const elapsedDays = (departure - arrival) / MS_PER_DAY;
  return Math.round(elapsedDays);
}
/**
 * Render a room as the one-line summary shown in an `h-note` span.
 *
 * Parts are joined with " · " and any part whose source value is missing or zero
 * is left out. Order: size, nightly price, total (with night count), availability
 * text, booking deadline.
 *
 * @param room - Scraped room data.
 * @param link - Supplies the check-in/check-out dates used for the night count.
 * @returns The summary text; an empty string when the room has no usable fields.
 */
function buildNote(room: RoomResult, link: HotelLink): string {
  const n = nightCount(link.checkIn, link.checkOut);
  const parts: string[] = [];
  if (room.sqft) parts.push(`${room.sqft} sq ft`);
  if (room.nightly) parts.push(`$${room.nightly}/n`);
  if (room.total) {
    // Singular for a one-night stay ("1 night", not "1 nights").
    const nights = `${n} night${n === 1 ? '' : 's'}`;
    // Pin the locale so the thousands separator does not depend on the host environment.
    parts.push(`$${room.total.toLocaleString('en-US')} total (${nights})`);
  }
  if (room.availability) parts.push(room.availability);
  // A deadline starting with "before …" reads better as "book before …".
  if (room.deadline) parts.push(room.deadline.replace(/^before\s+/i, 'book before '));
  return parts.join(' · ');
}
/**
 * Insert or refresh the `h-note` span that follows a hotel's name inside its anchor.
 *
 * The anchor is located by its exact `href` text. An `h-note` span directly after
 * the name is replaced; otherwise a new one is inserted right after the name. Only
 * the first matching anchor in the document is patched.
 *
 * @param html - Full calendar HTML.
 * @param link - Hotel whose anchor is patched; `link.url` must appear verbatim in an href.
 * @param room - Scraped room data, rendered through `buildNote`.
 * @returns The patched HTML, or `html` unchanged (after logging a warning) when the
 *          anchor cannot be found.
 */
function patchHtml(html: string, link: HotelLink, room: RoomResult): string {
  // The note embeds text scraped from a third-party page, so neutralise markup
  // characters. This also guarantees the note never contains `<`, which the
  // `[^<]*` below relies on to find and replace it on the next run.
  const note = buildNote(room, link)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');

  const escapedUrl = link.url.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // `[^<]+` must be greedy: a lazy quantifier followed by an optional group is
  // satisfied by the first character of the name, which would place the note
  // inside the name and never match an existing note.
  const anchorRe = new RegExp(
    `(href="${escapedUrl}"[\\s\\S]*?<span class="h-name">[^<]+)(?:<span class="h-note">[^<]*<\\/span>)?`
  );

  // A replacer function is used because the note contains "$<digits>" (prices),
  // which a replacement STRING would interpret as capture-group references.
  // It also records whether a match happened, so an unchanged note is not
  // misreported as a missing anchor.
  let matched = false;
  const patched = html.replace(anchorRe, (_whole: string, head: string) => {
    matched = true;
    return `${head}<span class="h-note">${note}</span>`;
  });

  if (!matched) log.warn(`h-name anchor not matched`, { hotelId: link.hotelId });
  return patched;
}
// ── Main ──────────────────────────────────────────────────────────────────────
/**
 * Script entry point: read the calendar, scrape each hotel in turn, patch the HTML
 * in memory, and write it back unless running in dry-run mode.
 *
 * Exits the process with code 1 when the calendar contains no hotel links or when
 * `--hotel` names an ID that is not in the calendar. A failure on one hotel is
 * logged and recorded as "no data"; it does not stop the remaining hotels.
 */
async function main(): Promise<void> {
  const originalHtml = readFileSync(HTML_FILE, 'utf8');
  let html = originalHtml;
  let links = parseLinks(html);
  if (!links.length) {
    log.error('no direct ho* hotel links found — check URL format in HTML');
    process.exit(1);
  }
  if (SINGLE) {
    links = links.filter(l => l.hotelId === SINGLE);
    if (!links.length) {
      // Fail before launching a browser rather than "succeeding" on zero hotels.
      log.error('no hotel link matches --hotel', { hotelId: SINGLE });
      process.exit(1);
    }
  }

  log.info(`starting scrape`, { hotels: links.length, dryRun: DRY_RUN, debug: DEBUG });
  links.forEach(l => log.info(`hotel`, { id: l.hotelId, name: l.name, checkIn: l.checkIn, checkOut: l.checkOut }));

  const results: { link: HotelLink; room: RoomResult | null }[] = [];
  const browser: Browser = await chromium.launch({ headless: !DEBUG });
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
      viewport: { width: 1440, height: 900 },
      locale: 'en-US',
      timezoneId: 'America/Los_Angeles',
    });

    for (const [index, link] of links.entries()) {
      const page = await context.newPage();
      try {
        const room = await scrapeHotel(page, link);
        results.push({ link, room });
        if (room && !DRY_RUN) html = patchHtml(html, link, room);
      } catch (err) {
        log.error(`hotel scrape threw`, { hotelId: link.hotelId, err: String(err) });
        results.push({ link, room: null });
      } finally {
        // A page that fails to close must not discard the results gathered so far.
        await page.close().catch(err => log.warn(`page close failed`, { hotelId: link.hotelId, err: String(err) }));
      }
      // Randomised 1.5–3 s pause between hotels; pointless after the last one.
      if (index < links.length - 1) {
        await new Promise(r => setTimeout(r, 1_500 + Math.random() * 1_500));
      }
    }
  } finally {
    // Always release the browser, even if creating the context or a page threw.
    await browser.close();
  }

  const succeeded = results.filter(r => r.room !== null).length;
  const failed = results.length - succeeded;
  log.info(`scrape complete`, { succeeded, failed });
  for (const { link, room } of results) {
    if (room) {
      log.info(`result`, { id: link.hotelId, nightly: room.nightly, total: room.total });
    } else {
      log.warn(`no data`, { id: link.hotelId, name: link.name });
    }
  }

  if (DRY_RUN) {
    log.info('dry-run complete — no changes written');
  } else if (html === originalHtml) {
    // Nothing was patched, so leave the file untouched instead of reporting an update.
    log.info('no changes to write', { path: HTML_FILE });
  } else {
    writeFileSync(HTML_FILE, html, 'utf8');
    log.info(`HTML updated`, { path: HTML_FILE });
  }
}
// Run the script; any rejection escaping main() is logged and becomes exit code 1.
main().catch((err) => {
  log.error('fatal', { err: String(err) });
  process.exit(1);
});