feat(box): OCR extraction + GPU (OpenAI-compatible) rating backend, env-selectable

Wire the on-box (Claude-API-less) path decided with the operator: EXTRACT_BACKEND=ocr sends each screenshot to the on-box mrnumber-ocr service (raw text, no per-shot structuring); build_rating_profile uses an OpenAI-compatible LLM on a DO GPU droplet (RATING_LLM_URL) which extracts the reports from the raw OCR text AND produces the multi-axis verdict. Reports are folded back into the history so the people-signal + counts + safety flags reflect them; safety detection also scans the raw OCR lines so a LE term forces cop_flag even before structuring. vision/Claude stays the plum-dev default. +5 tests incl. full OCR→GPU→cop_flag flow. 33/33. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 00:39:06 -04:00 · 2026-06-30 00:39:06 -04:00 · 15e7348413
commit 15e7348413
parent 9029f3789c
2 changed files with 173 additions and 20 deletions
--- a/client/mr_lookup.py
+++ b/client/mr_lookup.py
@ -41,12 +41,18 @@ from typing import Any
 from redroid_client import (
    RedroidDevice,
    clean_phone as _clean_phone,
+    extract_backend,
    extract_screenshot,
    json_mode,
    load_sdk,
    log,
+    ocr_extract,
+    ocr_url,
+    openai_chat,
    people_base_url,
    people_service_token,
+    rating_llm_model,
+    rating_llm_url,
    record_people_signal,
    set_json_mode,
 )
@ -104,6 +110,14 @@ def _build_vision_prompt(screenshot_path: str, phone: str) -> str:


 async def _extract_from_screenshot(screenshot_path: str, phone: str) -> dict[str, Any]:
+    """Per-screenshot extraction. With EXTRACT_BACKEND=vision (plum dev) Claude returns a
+    structured report dict. With EXTRACT_BACKEND=ocr (the box) the on-box tesseract service
+    returns raw screen text — there is no per-shot structuring; the rating LLM does the
+    extraction + reasoning from the concatenated OCR text downstream."""
+    if extract_backend() == "ocr":
+        payload = ocr_extract(str(screenshot_path), base_url=ocr_url())
+        return {"reports": [], "red_flags": [], "classification": None,
+                "report_count": None, "raw_ocr": payload.get("text", "")}
    return await extract_screenshot(
        screenshot_path=str(screenshot_path),
        system=MR_NUMBER_SYSTEM,
@ -120,10 +134,14 @@ def merge_reports(extractions: list[dict[str, Any]], phone: str) -> dict[str, An
    red_seen: set[str] = set()
    classification: str | None = None
    declared_count = 0
+    ocr_chunks: list[str] = []

    for ex in extractions:
        if not isinstance(ex, dict):
            continue
+        chunk = (ex.get("raw_ocr") or "").strip()
+        if chunk:
+            ocr_chunks.append(chunk)
        if not classification and ex.get("classification"):
            classification = ex.get("classification")
        rc = ex.get("report_count")
@ -140,18 +158,24 @@ def merge_reports(extractions: list[dict[str, Any]], phone: str) -> dict[str, An
                red_seen.add(key)
                red_flags.append(str(f).strip())

+    ocr_text = "\n".join(ocr_chunks)
+    # Safety flags are deterministic keyword/regex over text, so for the OCR backend we
+    # scan the raw OCR lines too (the LE/violence signal must be caught even before the
+    # rating LLM structures the reports).
+    safety_inputs = reports + [ln.strip() for ln in ocr_text.splitlines() if ln.strip()]
    return {
        "phone": phone,
        "reports": reports,
        "red_flags": red_flags,
        "classification": classification,
+        "ocr_text": ocr_text,
        # report_count = the larger of what the app declared vs. how many we captured
        "report_count": max(declared_count, len(reports)),
        "captured_count": len(reports),
        "declared_count": declared_count,
        # Critical safety signals, promoted OUT of the flat report/flag lists so a
        # human (and the verdict) never has to find them buried in row 7 of 14.
-        "safety_flags": detect_safety_flags(reports, red_flags),
+        "safety_flags": detect_safety_flags(safety_inputs, red_flags),
    }


@ -295,13 +319,13 @@ def _build_rating_prompt(history: dict[str, Any]) -> str:
            "respect": {"score": "0-100", "note": "politeness, respects boundaries, not pushy"},
            "safety": {"score": "0-100", "note": "no law-enforcement/violence/coercion signals"},
        },
+        "reports": "array of strings — the verbatim community report texts (extract them from the raw OCR when given raw text)",
        "positive_signals": "array of strings — concrete positives found (quote/paraphrase the report)",
        "negative_signals": "array of strings — concrete negatives found",
        "nuanced_notes": "array of strings — where you read a signal NON-literally (e.g. deposit mentions as positive)",
        "summary": "2-3 sentence consolidated profile of this caller",
        "recommended_result": "one of: approved, denied, pending, not_found",
    }
-    reports_block = "\n".join(f"- {r}" for r in history.get("reports") or []) or "(no report text captured)"
    safety_flags = history.get("safety_flags") or []
    safety_block = ""
    if safety_flags:
@ -310,38 +334,55 @@ def _build_rating_prompt(history: dict[str, Any]) -> str:
            f"PROMOTED CRITICAL SAFETY FLAGS (already detected — score the safety axis "
            f"at/near 0 and recommend denied): {promoted}\n\n"
        )
+    # OCR backend gives raw screen text (no pre-structured reports) — ask the model to
+    # extract the reports first; the vision backend already provides a clean report list.
+    ocr_text = (history.get("ocr_text") or "").strip()
+    if not (history.get("reports")) and ocr_text:
+        content_block = (
+            "Raw OCR text of the Mr. Number report screen(s) — noisy (UI chrome, partial "
+            "lines). FIRST extract the genuine community report texts (ignore buttons, "
+            "headers, nav), then rate:\n"
+            f"<<<OCR\n{ocr_text}\nOCR>>>\n\n"
+        )
+    else:
+        reports_block = "\n".join(f"- {r}" for r in history.get("reports") or []) or "(no report text captured)"
+        content_block = f"All captured community reports:\n{reports_block}\n\n"
    return (
        f"Caller: {history.get('phone')}\n"
        f"App classification: {history.get('classification')}\n"
        f"Reports the app says exist: {history.get('report_count')} "
        f"(captured {history.get('captured_count')})\n\n"
-        f"All captured community reports:\n{reports_block}\n\n"
+        f"{content_block}"
        f"{safety_block}"
-        f"Vision-flagged terms: {', '.join(history.get('red_flags') or []) or '(none)'}\n\n"
+        f"Flagged terms: {', '.join(history.get('red_flags') or []) or '(none)'}\n\n"
        "Produce the caller's rating profile. Apply the domain nuance from the system prompt "
        "(especially: deposits are a positive signal; law-enforcement signals force denied). "
        f"Respond with ONLY one JSON object:\n{json.dumps(schema, indent=2)}"
    )


-async def build_rating_profile(history: dict[str, Any]) -> dict[str, Any] | None:
-    """Consolidate the full report history into a multi-axis rating profile via the SDK."""
-    if not (history.get("reports")):
+def _extract_json(text: str) -> dict[str, Any] | None:
+    """Pull the first JSON object out of an LLM response (handles ```json fences / prose)."""
+    if not text:
        return None
-    ClaudeClient, parse_json_response = load_sdk()
-    client = ClaudeClient(model=RATING_MODEL, max_concurrent=1)
-    resp = await client.generate(
-        system=RATING_SYSTEM,
-        user=_build_rating_prompt(history),
-        cwd=str(OUTPUT_DIR),
-        allowed_tools=[],
-    )
-    if not resp:
+    fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
+    candidate = fence.group(1) if fence else None
+    if candidate is None:
+        start = text.find("{")
+        end = text.rfind("}")
+        candidate = text[start:end + 1] if start != -1 and end > start else None
+    if not candidate:
        return None
-    parsed = parse_json_response(resp)
+    try:
+        obj = json.loads(candidate)
+        return obj if isinstance(obj, dict) else None
+    except json.JSONDecodeError:
+        return None
+
+
+def _normalize_profile(parsed: dict[str, Any] | None) -> dict[str, Any] | None:
    if not isinstance(parsed, dict):
        return None
-    # Normalize: ensure score is an int and grade is consistent with it.
    score = parsed.get("score")
    if isinstance(score, (int, float)):
        parsed["score"] = int(score)
@ -350,6 +391,28 @@ async def build_rating_profile(history: dict[str, Any]) -> dict[str, Any] | None
    return parsed


+async def build_rating_profile(history: dict[str, Any]) -> dict[str, Any] | None:
+    """Consolidate the report history into a multi-axis rating profile. Backend is
+    env-selected: RATING_LLM_URL set → an OpenAI-compatible LLM on the DO GPU droplet
+    (also does report extraction from raw OCR); else the Claude batch SDK (plum dev)."""
+    if not (history.get("reports") or history.get("ocr_text")):
+        return None
+    prompt = _build_rating_prompt(history)
+
+    gpu_url = rating_llm_url()
+    if gpu_url:
+        content = openai_chat(base_url=gpu_url, model=rating_llm_model(),
+                              system=RATING_SYSTEM, user=prompt)
+        return _normalize_profile(_extract_json(content))
+
+    ClaudeClient, parse_json_response = load_sdk()
+    client = ClaudeClient(model=RATING_MODEL, max_concurrent=1)
+    resp = await client.generate(system=RATING_SYSTEM, user=prompt, cwd=str(OUTPUT_DIR), allowed_tools=[])
+    if not resp:
+        return None
+    return _normalize_profile(parse_json_response(resp))
+
+
 def grade_from_score(score: int | float | None) -> str:
    if score is None:
        return "?"
@ -686,6 +749,10 @@ async def main_async(phone: str, ref: str | None, dry_run: bool, dump_ui: bool =
        ex = await _extract_from_screenshot(str(shot), phone)
        extractions.append(ex)
    history = merge_reports(extractions, phone)
+    if history.get("ocr_text") and not history.get("reports"):
+        log(f"[mr-number] OCR backend: {len(history['ocr_text'])} chars of raw text "
+            f"from {len(shots)} screenshot(s) — reports extracted during rating.")
+    else:
        log(f"[mr-number] Consolidated {history['captured_count']} unique reports "
            f"(app declares {history['declared_count']}).")

@ -700,6 +767,17 @@ async def main_async(phone: str, ref: str | None, dry_run: bool, dump_ui: bool =
    log("[mr-number] Building rating profile (consolidation via batch SDK)...")
    profile = await build_rating_profile(history)
    if profile:
+        # OCR path: the rating LLM extracted the reports — fold them back so the signal
+        # record + counts reflect them (and re-run safety detection over the real reports).
+        if not history.get("reports") and isinstance(profile.get("reports"), list):
+            history["reports"] = [str(r).strip() for r in profile["reports"] if str(r).strip()]
+            history["captured_count"] = len(history["reports"])
+            history["report_count"] = max(history.get("report_count") or 0, len(history["reports"]))
+            history["safety_flags"] = detect_safety_flags(
+                history["reports"] + [ln.strip() for ln in (history.get("ocr_text") or "").splitlines() if ln.strip()],
+                history.get("red_flags") or [],
+            )
+            safety_flags = history["safety_flags"]
        result = result_from_profile(profile)
        log(f"[mr-number] Rating: {profile.get('score')}/100 grade {profile.get('grade')} "
            f"→ result '{result}'  ({profile.get('summary', '')})")
--- a/client/mr_lookup_test.py
+++ b/client/mr_lookup_test.py
@ -160,6 +160,43 @@ class TestScreeningSignalValue(unittest.TestCase):
        self.assertIsNone(mr_lookup.screening_signal_value("not_found", []))


+class TestOcrGpuBackends(unittest.IsolatedAsyncioTestCase):
+    """The on-box (Claude-API-less) path: OCR extraction + GPU OpenAI-compatible rating."""
+
+    def test_extract_json_variants(self):
+        self.assertEqual(mr_lookup._extract_json('```json\n{"a": 1}\n```'), {"a": 1})
+        self.assertEqual(mr_lookup._extract_json('blah {"score": 8} trailing'), {"score": 8})
+        self.assertIsNone(mr_lookup._extract_json("no json here"))
+
+    def test_merge_collects_ocr_and_flags_from_raw_text(self):
+        # OCR extractions carry raw_ocr (no structured reports). A LE term in the raw text
+        # must still trip the safety flag before the rating LLM structures anything.
+        ex = [{"reports": [], "red_flags": [], "raw_ocr": "User reports\nEs policía\nasks for address"}]
+        h = mr_lookup.merge_reports(ex, "+16315304426")
+        self.assertEqual(h["reports"], [])
+        self.assertIn("Es policía", h["ocr_text"])
+        self.assertTrue(any(f["category"] == "law_enforcement" for f in h["safety_flags"]))
+
+    async def test_extract_from_screenshot_uses_ocr_backend(self):
+        with patch("mr_lookup.extract_backend", return_value="ocr"), \
+             patch("mr_lookup.ocr_extract", return_value={"ok": True, "text": "raw screen text", "lines": ["raw screen text"]}) as m:
+            out = await mr_lookup._extract_from_screenshot("/tmp/s.png", "+15551234567")
+            m.assert_called_once()
+            self.assertEqual(out["raw_ocr"], "raw screen text")
+            self.assertEqual(out["reports"], [])
+
+    async def test_rating_uses_gpu_when_url_set(self):
+        gpu_json = '{"score": 8, "recommended_result": "denied", "reports": ["no-show, ghosted"], "axes": {"safety": {"score": 0}}}'
+        with patch("mr_lookup.rating_llm_url", return_value="http://10.20.0.9:8000"), \
+             patch("mr_lookup.rating_llm_model", return_value="qwen"), \
+             patch("mr_lookup.openai_chat", return_value=gpu_json) as chat:
+            prof = await mr_lookup.build_rating_profile({"ocr_text": "Es policia\nno show", "reports": []})
+            chat.assert_called_once()
+            self.assertEqual(prof["score"], 8)
+            self.assertEqual(prof["grade"], "F")          # normalized from score
+            self.assertEqual(prof["reports"], ["no-show, ghosted"])
+
+
 class TestFullFlow(unittest.IsolatedAsyncioTestCase):
    """End-to-end device path with the expensive parts mocked."""

@ -260,6 +297,44 @@ class TestFullFlow(unittest.IsolatedAsyncioTestCase):
            mock_rate.assert_not_called()
            mock_record.assert_not_called()

+    async def test_box_path_ocr_extract_plus_gpu_rating(self):
+        # The real on-box flow: OCR backend (no structured reports) → GPU rating extracts
+        # the reports; a LE term in the OCR text forces cop_flag, folded into the signal.
+        phone = "+16315304426"
+        gpu_json = ('{"score": 3, "recommended_result": "denied", '
+                    '"reports": ["Es policia", "no show after booking"], '
+                    '"axes": {"safety": {"score": 0}}, "summary": "LE-adjacent; deny."}')
+        mock_requests = MagicMock()
+        mock_post = mock_requests.post
+        mock_post.return_value.json.return_value = {"id": 7, "status": "created"}
+        mock_post.return_value.raise_for_status = MagicMock()
+
+        with patch("mr_lookup.launch_app"), \
+             patch("mr_lookup.find_and_tap_text", return_value=True), \
+             patch("mr_lookup.find_edit_text_and_input", return_value=True), \
+             patch("mr_lookup.go_to_search", return_value=True), \
+             patch("mr_lookup.open_report_detail", return_value=True), \
+             patch("mr_lookup.expand_all_reports", return_value=True), \
+             patch("mr_lookup.capture_full_history", return_value=[Path("/tmp/s0.png")]), \
+             patch("mr_lookup.extract_backend", return_value="ocr"), \
+             patch("mr_lookup.ocr_extract", return_value={"ok": True, "text": "User reports\nEs policia\nno show after booking", "lines": []}), \
+             patch("mr_lookup.rating_llm_url", return_value="http://10.20.0.9:8000"), \
+             patch("mr_lookup.rating_llm_model", return_value="qwen"), \
+             patch("mr_lookup.openai_chat", return_value=gpu_json), \
+             patch("mr_lookup.save_history", return_value=Path("/tmp/hist.json")), \
+             patch.dict("sys.modules", {"requests": mock_requests}), \
+             patch("mr_lookup.PEOPLE_SERVICE_TOKEN", "fake-token"), \
+             patch("mr_lookup.time.sleep"):
+
+            out = await mr_lookup.main_async(phone=phone, ref="prospect-9", dry_run=False)
+
+            self.assertEqual(out["result"], "denied")            # LE flag forces denied
+            body = mock_post.call_args[1].get("json", {})
+            self.assertEqual(body.get("valueText"), "cop_flag")  # LE → cop_flag verdict
+            # GPU-extracted reports folded into the recorded signal.
+            self.assertIn("no show after booking", body["valueJsonb"]["reports"])
+            self.assertTrue(any(f["category"] == "law_enforcement" for f in body["valueJsonb"]["safety_flags"]))
+

 class TestEmulatorControl(unittest.TestCase):
    """adb controller in isolation."""