236 lines
9.8 KiB
TypeScript
236 lines
9.8 KiB
TypeScript
#!/usr/bin/env bun
|
|
/**
 * scrape-hotels.ts — Hotels.com rate refresher for calendar HTML
 *
 * Reads all direct ho* hotel links from the calendar, navigates each with
 * Playwright, extracts the cheapest available room, and patches h-note spans.
 *
 * Usage:
 *   bun scrape-hotels.ts                    # update HTML in place
 *   bun scrape-hotels.ts --dry-run          # print results, no file write
 *   bun scrape-hotels.ts --debug            # headed browser + /tmp screenshots
 *   bun scrape-hotels.ts --hotel ho112914   # single hotel only
 *
 * Docker env vars:
 *   HTML_FILE=/work/calendar.html   # override file path (default: sibling file)
 *   DRY_RUN=1                       # same as --dry-run
 *   DEBUG=1                         # same as --debug
 */
|
|
|
|
import { chromium, type Browser, type Page } from 'playwright';
|
|
import { readFileSync, writeFileSync } from 'fs';
|
|
import { resolve } from 'path';
|
|
import { createLogger } from '@lilith/logger';
|
|
|
|
// Logger shared by every stage of the script.
const log = createLogger('hotel-scraper');

// Calendar file to read and patch; the HTML_FILE env var overrides the
// default file that sits next to this script.
const HTML_FILE =
  process.env.HTML_FILE ?? resolve(import.meta.dir, 'calendar-2026-apr-may-june.html');

// Report results without writing the calendar back (--dry-run or DRY_RUN=1).
const DRY_RUN = process.env.DRY_RUN === '1' || process.argv.includes('--dry-run');

// Headed browser plus /tmp screenshots (--debug or DEBUG=1).
const DEBUG = process.env.DEBUG === '1' || process.argv.includes('--debug');

// Value following `--hotel` on the command line, or null when the flag is absent.
const SINGLE = (() => {
  const flagAt = process.argv.indexOf('--hotel');
  if (flagAt < 0) return null;
  return process.argv[flagAt + 1];
})();
|
|
|
|
// ── Types ────────────────────────────────────────────────────────────────────
|
|
|
|
/** One Hotels.com anchor found in the calendar HTML. */
interface HotelLink {
  /** The href attribute value exactly as it appears in the HTML source. */
  url: string;
  /** Hotels.com property id taken from the URL path, e.g. `ho112914`. */
  hotelId: string;
  /** Value of the `chkin` query parameter, formatted `YYYY-MM-DD`. */
  checkIn: string;
  /** Value of the `chkout` query parameter, formatted `YYYY-MM-DD`. */
  checkOut: string;
  /** Trimmed text of the anchor's `h-name` span. */
  name: string;
}
|
|
|
|
/** The cheapest room scraped from a hotel page. */
interface RoomResult {
  /** Text of the room-name element; empty string when none was found. */
  roomName: string;
  /** Room size parsed from an "N sq ft" label, or null when no such label exists. */
  sqft: number | null;
  /** Number parsed from the nightly price element. */
  nightly: number;
  /** Number parsed from the total price element; 0 when it could not be read. */
  total: number;
  /** Text of the remaining-rooms element, or null when absent. */
  availability: string | null;
  /** Text of the urgency element, or null when absent. */
  deadline: string | null;
}
|
|
|
|
// ── HTML parsing ─────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Collects every direct Hotels.com property link in the calendar markup.
 *
 * An anchor qualifies when its href points at `https://www.hotels.com/ho<digits>/…`,
 * carries `chkin=` immediately followed by `&chkout=` (or `&amp;chkout=`), and
 * its first child is a `<span class="h-name">`.
 *
 * @param html Full calendar HTML text.
 * @returns One entry per matching anchor, in document order.
 */
function parseLinks(html: string): HotelLink[] {
  const anchorRe = /href="(https:\/\/www\.hotels\.com\/(ho\d+)\/[^"]+?chkin=(\d{4}-\d{2}-\d{2})&(?:amp;)?chkout=(\d{4}-\d{2}-\d{2})[^"]*)"[^>]*>\s*<span class="h-name">([^<]+)/g;

  const found: HotelLink[] = [];
  for (const hit of html.matchAll(anchorRe)) {
    const [, url, hotelId, checkIn, checkOut, rawName] = hit;
    found.push({ url, hotelId, checkIn, checkOut, name: rawName.trim() });
  }
  return found;
}
|
|
|
|
// ── Scraping ─────────────────────────────────────────────────────────────────
|
|
|
|
// Hotels.com (Expedia PWA) selectors — 2025/2026 design system.
// Each entry is a CSS selector list: alternatives are tried together, so a
// markup change only breaks extraction when every alternative misses.
// Run with --debug to capture /tmp screenshots if these miss.
const SEL = {
  roomCard: [
    '[data-stid="rooms-and-rates-room-card"]',
    '[data-stid="room-card"]',
    '[data-testid="room-card"]',
    '.uitk-card[data-room-id]',
  ].join(', '),
  roomName: [
    '[data-stid="rooms-and-rates-room-name"]',
    '[data-testid="room-name"]',
    'h3',
  ].join(', '),
  roomSqft: [
    '[data-stid="rooms-and-rates-room-size"]',
    '[data-testid="room-size"]',
  ].join(', '),
  nightly: [
    '[data-stid="price-summary-lead-price"]',
    '[data-testid="price-display-value"]',
    '.uitk-lockup-price',
  ].join(', '),
  total: [
    '[data-stid="price-summary-total-price"]',
    '[data-testid="total-price"]',
    '.uitk-lockup-sub-info',
  ].join(', '),
  availability: [
    '[data-stid="rooms-and-rates-remaining-rooms"]',
    '[data-testid="remaining-rooms"]',
  ].join(', '),
  urgency: [
    '[data-stid="urgency-message"]',
    '[data-testid="urgency-message"]',
    '.uitk-badge-standard',
  ].join(', '),
} as const;
|
|
|
|
/**
 * Loads one hotel page and extracts its cheapest priced room.
 *
 * @param page Playwright page to navigate; the caller owns and closes it.
 * @param link Hotel anchor whose URL should be visited.
 * @returns The room with the lowest nightly price, or null when no room card
 *          appeared within the timeout or no card yielded a positive price.
 */
async function scrapeHotel(page: Page, link: HotelLink): Promise<RoomResult | null> {
  log.info(`scraping ${link.hotelId}`, { name: link.name, checkIn: link.checkIn, checkOut: link.checkOut });

  // link.url is the raw href attribute text, so query separators may still be
  // HTML-encoded as "&amp;". Navigating to that verbatim would turn
  // "chkout" into a parameter named "amp;chkout" and lose the stay dates.
  const targetUrl = link.url.replace(/&amp;/g, '&');

  try {
    await page.goto(targetUrl, { waitUntil: 'networkidle', timeout: 30_000 });
  } catch {
    // The network may never go idle; retry with a weaker readiness signal and
    // give client-side rendering a fixed grace period instead.
    await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout: 30_000 });
    await page.waitForTimeout(4_000);
  }

  if (DEBUG) {
    const shot = `/tmp/hotel-${link.hotelId}-loaded.png`;
    await page.screenshot({ path: shot, fullPage: false });
    log.debug(`screenshot saved`, { path: shot });
  }

  const cardHandle = await page.waitForSelector(SEL.roomCard, { timeout: 15_000 }).catch(() => null);
  if (!cardHandle) {
    log.warn(`room cards not found — selectors may need updating`, { hotelId: link.hotelId });
    if (DEBUG) {
      const shot = `/tmp/hotel-${link.hotelId}-no-rooms.png`;
      await page.screenshot({ path: shot, fullPage: true });
      // The first few headings hint at what page was actually served.
      const headings = await page.$$eval('h1, h2, h3', els =>
        els.map(e => e.textContent?.trim()).filter(Boolean).slice(0, 10)
      );
      log.debug(`page headings`, { shot, headings });
    }
    return null;
  }

  // Everything inside this callback runs in the browser page, so it may only
  // use its argument and DOM globals — nothing from this module's scope.
  const result = await page.evaluate((sel) => {
    const cards = [...document.querySelectorAll(sel.roomCard)];
    if (!cards.length) return null;

    const textOf = (el: Element, s: string): string | null =>
      el.querySelector(s)?.textContent?.trim() ?? null;

    // Read a single amount from the text: prefer the first "$"-prefixed number
    // and fall back to the first number of any kind. Stripping all non-digits
    // instead would fuse unrelated numbers ("$1,234 total for 2 nights" → 12342).
    const toNum = (text: string | null): number => {
      if (!text) return 0;
      const amount =
        text.match(/\$\s*(\d[\d,]*(?:\.\d+)?)/) ?? text.match(/(\d[\d,]*(?:\.\d+)?)/);
      if (!amount) return 0;
      return parseFloat(amount[1].replace(/,/g, '')) || 0;
    };

    const rooms = cards.map(card => {
      // Allow thousands separators so "1,200 sq ft" is read as 1200, not 200.
      const sqftMatch = textOf(card, sel.roomSqft)?.match(/(\d[\d,]*)\s*sq\s*ft/i);
      return {
        roomName: textOf(card, sel.roomName) ?? '',
        sqft: sqftMatch ? parseInt(sqftMatch[1].replace(/,/g, ''), 10) : null,
        nightly: toNum(textOf(card, sel.nightly)),
        total: toNum(textOf(card, sel.total)),
        availability: textOf(card, sel.availability),
        deadline: textOf(card, sel.urgency),
      };
    });

    // Cards without a readable nightly price cannot be ranked.
    const valid = rooms.filter(r => r.nightly > 0);
    if (!valid.length) return null;
    return valid.sort((a, b) => a.nightly - b.nightly)[0];
  }, SEL);

  if (!result) {
    log.warn(`price extraction failed — page structure may have changed`, { hotelId: link.hotelId });
    if (DEBUG) await page.screenshot({ path: `/tmp/hotel-${link.hotelId}-no-price.png`, fullPage: true });
    return null;
  }

  log.info(`extracted room`, { hotelId: link.hotelId, room: result.roomName, nightly: result.nightly, total: result.total });
  return result;
}
|
|
|
|
// ── HTML patching ─────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Number of nights between two dates, rounded to the nearest whole day.
 *
 * @param checkIn  Arrival date string accepted by `Date`.
 * @param checkOut Departure date string accepted by `Date`.
 * @returns `checkOut − checkIn` in days; NaN when either string is not a valid date.
 */
function nightCount(checkIn: string, checkOut: string): number {
  const MS_PER_DAY = 86_400_000;
  const arrival = new Date(checkIn).getTime();
  const departure = new Date(checkOut).getTime();
  const elapsedMs = departure - arrival;
  return Math.round(elapsedMs / MS_PER_DAY);
}
|
|
|
|
/**
 * Formats a scraped room as the one-line summary shown in an h-note span.
 *
 * Segments are joined with " · " and each is omitted when its value is
 * missing or zero: size, nightly rate, stay total with night count,
 * availability warning, and booking deadline.
 *
 * @param room Scraped room data.
 * @param link Hotel link supplying the stay dates for the night count.
 * @returns Plain text (not HTML-escaped); empty string when nothing is known.
 */
function buildNote(room: RoomResult, link: HotelLink): string {
  const nights = nightCount(link.checkIn, link.checkOut);
  const parts: string[] = [];

  if (room.sqft) parts.push(`${room.sqft} sq ft`);
  if (room.nightly) parts.push(`$${room.nightly}/n`);
  if (room.total) {
    // Pin the locale: prices are USD, and the host's default locale would
    // otherwise change the grouping (e.g. "1.234" under de-DE).
    const total = room.total.toLocaleString('en-US');
    const unit = nights === 1 ? 'night' : 'nights';
    parts.push(`$${total} total (${nights} ${unit})`);
  }
  if (room.availability) parts.push(`⚠ ${room.availability}`);
  // Turn a bare "Before <date>" urgency message into an instruction.
  if (room.deadline) parts.push(room.deadline.replace(/^before\s+/i, 'book before '));

  return parts.join(' · ');
}
|
|
|
|
/**
 * Writes a room's summary note into every calendar anchor that links to `link.url`.
 *
 * An existing `<span class="h-note">` is replaced where it stands — either
 * directly after the hotel name text or after the name span's closing tag.
 * When there is none, a new span is inserted right after the hotel name text.
 * Logs a warning when no anchor for the URL is found.
 *
 * @param html Calendar HTML text.
 * @param link Hotel link; `url` must be the href exactly as written in `html`.
 * @param room Scraped room data to summarise.
 * @returns The patched HTML, or `html` unchanged when nothing matched.
 */
function patchHtml(html: string, link: HotelLink, room: RoomResult): string {
  // The note contains text scraped from a third-party page; escape it so it
  // cannot inject markup and so a later run can still match it with [^<]*.
  const note = buildNote(room, link)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');

  const escapedUrl = link.url.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Group 1: the anchor from its href through the complete hotel name. The name
  // is matched greedily; a lazy match followed by an optional group would stop
  // after the first character and split the name in two.
  // Group 2: the name span's closing tag, captured only when an existing note
  // follows it, so that note is replaced in place rather than duplicated.
  const anchorRe = new RegExp(
    `(href="${escapedUrl}"[^>]*>\\s*<span class="h-name">[^<]+)` +
      `(?:(<\\/span>\\s*)?<span class="h-note">[^<]*<\\/span>)?`,
    'g'
  );

  // A replacer function is required here: in a replacement *string* a note such
  // as "$189/n" would be read as the "$1" back-reference followed by "89/n".
  let matched = false;
  const patched = html.replace(
    anchorRe,
    (_whole: string, head: string, closing: string | undefined) => {
      matched = true;
      return `${head}${closing ?? ''}<span class="h-note">${note}</span>`;
    }
  );

  // Track the match itself rather than comparing strings, so re-applying an
  // identical note is not reported as a failure.
  if (!matched) log.warn(`h-name anchor not matched`, { hotelId: link.hotelId });
  return patched;
}
|
|
|
|
// ── Main ──────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Script entry point: reads the calendar, scrapes each linked hotel in turn,
 * patches the notes, logs a summary, and writes the file back unless dry-run.
 *
 * Exits the process with status 1 when the calendar contains no hotel links
 * or when `--hotel` names an id that is not in the calendar.
 */
async function main(): Promise<void> {
  const originalHtml = readFileSync(HTML_FILE, 'utf8');
  let html = originalHtml;
  let links = parseLinks(html);

  if (!links.length) {
    log.error('no direct ho* hotel links found — check URL format in HTML');
    process.exit(1);
  }

  if (SINGLE) {
    links = links.filter(l => l.hotelId === SINGLE);
    // Fail loudly instead of launching a browser to scrape nothing.
    if (!links.length) {
      log.error(`requested hotel not found in calendar`, { hotelId: SINGLE });
      process.exit(1);
    }
  }

  log.info(`starting scrape`, { hotels: links.length, dryRun: DRY_RUN, debug: DEBUG });
  for (const l of links) {
    log.info(`hotel`, { id: l.hotelId, name: l.name, checkIn: l.checkIn, checkOut: l.checkOut });
  }

  const results: { link: HotelLink; room: RoomResult | null }[] = [];
  const browser: Browser = await chromium.launch({ headless: !DEBUG });

  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
      viewport: { width: 1440, height: 900 },
      locale: 'en-US',
      timezoneId: 'America/Los_Angeles',
    });

    // Hotels are visited one at a time, with a randomised pause in between.
    for (const [index, link] of links.entries()) {
      const page = await context.newPage();
      try {
        const room = await scrapeHotel(page, link);
        results.push({ link, room });
        if (room && !DRY_RUN) html = patchHtml(html, link, room);
      } catch (err) {
        // One failing hotel must not abort the rest of the run.
        log.error(`hotel scrape threw`, { hotelId: link.hotelId, err: String(err) });
        results.push({ link, room: null });
      } finally {
        await page.close();
      }

      // Pause 1.5–3 s before the next hotel; nothing to wait for after the last.
      if (index < links.length - 1) {
        await new Promise(r => setTimeout(r, 1_500 + Math.random() * 1_500));
      }
    }
  } finally {
    // Always release the browser, even when context or page creation throws.
    await browser.close();
  }

  const succeeded = results.filter(r => r.room !== null).length;
  const failed = results.length - succeeded;
  log.info(`scrape complete`, { succeeded, failed });

  for (const { link, room } of results) {
    if (room) {
      log.info(`result`, { id: link.hotelId, nightly: room.nightly, total: room.total });
    } else {
      log.warn(`no data`, { id: link.hotelId, name: link.name });
    }
  }

  if (DRY_RUN) {
    log.info('dry-run complete — no changes written');
  } else if (html === originalHtml) {
    // Avoid rewriting the file (and claiming an update) when nothing changed.
    log.info(`no changes to write`, { path: HTML_FILE });
  } else {
    writeFileSync(HTML_FILE, html, 'utf8');
    log.info(`HTML updated`, { path: HTML_FILE });
  }
}
|
|
|
|
// Run the script; any error that escapes main() is logged and turned into a
// non-zero exit status.
main().catch(err => {
  log.error('fatal', { err: String(err) });
  process.exit(1);
});