# File-viewer listing header (scrape residue): 306 lines, 19 KiB, Python.
import json
from typing import Dict


# Slash-separated catalogue of threat labels. The joined string is interpolated
# verbatim into prompt JSON templates as the allowed values of "threat_type",
# so the " / " separator is part of the prompt text.
THREAT_TYPES = " / ".join(
    (
        "intrusion",
        "tailgating",
        "credential_theft",
        "fire_risk",
        "unattended_cooking",
        "carbon_monoxide",
        "sensor_stuck",
        "sensor_drift",
        "sensor_malfunction",
        "actuator_stuck",
        "lock_malfunction",
        "safety_device_failure",
        "water_leak",
        "possible_fall",
        "abnormal_inactivity",
        "health_concern",
        "child_safety",
        "behavioral_anomaly",
        "none",
    )
)


def _protocol_notes_block(material: Dict) -> str:
|
|
notes = material.get("protocol_notes", [])
|
|
return "\n".join(f"- {note}" for note in notes)


def build_triage_prompt(material: Dict) -> Dict[str, str]:
    """Assemble the system/user prompt pair for the triage stage.

    Args:
        material: Case material. ``query``, ``layout_summary``, ``signals``
            and ``chunk_index`` are read by subscript, so a missing key raises
            ``KeyError``. ``signals`` and ``chunk_index`` are serialized with
            ``json.dumps``. The "Matter Notes" section comes from
            ``_protocol_notes_block(material)``.

    Returns:
        A dict with the ``"system"`` and ``"user"`` prompt strings.
    """
    coordinator_role = (
        "You are the triage coordinator for smart-home log analysis. "
        "Infer the task from the query without hidden benchmark labels. "
        "Anchor to the query target first, then choose a small number of chunks for inspection."
    )

    # Case-specific sections. Expressions are evaluated top to bottom, in the
    # same order as they appear in the rendered prompt.
    case_context = f"""## Query
{material['query']}

## Matter Notes
{_protocol_notes_block(material)}

## Layout
{material['layout_summary']}

## Deterministic Signals
{json.dumps(material['signals'], ensure_ascii=False, indent=2)}

## Chunk Index
{json.dumps(material['chunk_index'], ensure_ascii=False, indent=2)}

"""

    # Fixed instructions and answer schema. This is a plain (non-f) string, so
    # the braces of the JSON template are written literally.
    fixed_instructions = """Rules:
- Choose exactly one `primary_task_profile`.
- Use `secondary_task_profile` only if it materially helps.
- If the query asks whether a specific device/room works normally, prefer `device-health`.
- If the query asks for an action or response plan, prefer `emergency-response`.
- If the query asks whether a recent event is a security threat, prefer `single-event-safety`, not `device-health`.
- If the query asks whether there is an abnormal behavior pattern over many hours, prefer `behavior-sequence`, not `device-health`.
- If the query asks for the home's current safety status or potential risks, prefer `composite-safety`, not `device-health`.
- Select only 1-4 `focus_chunk_ids` in round 1.
- If `primary_task_profile=device-health`, prioritize chunks that show:
- the suspected abnormal event,
- the immediate follow-up/retry sequence,
- and any later recovery or stabilization evidence.
- For `device-health`, avoid a single-chunk verdict when the case involves locks, contacts, sensors, or a claimed transient malfunction. Prefer at least:
- one chunk before or at the suspicious event,
- and one chunk after it that can confirm recovery, failed recovery, or missing follow-through.
- For locks and access devices, include both the command sequence and the later state outcome. Do not conclude health from one locally coherent unlock/lock snippet alone.
- For `composite-safety` or `emergency-response`, do not narrow the case to a single suspicious device too early. Prefer chunk selection that covers:
- the possible hazard trigger,
- nearby human/activity context,
- and any recovery or consequence evidence.
- For `single-event-safety` or `behavior-sequence`, prioritize chunks that show:
- the access path or trigger event,
- nearby human/device sequence context,
- and whether the sequence is explained by an ordinary routine.
- Missing logs for a device are not themselves evidence of failure.
- For non-`device-health` queries, do not let a transient `None`, temperature spike, or sparse telemetry become the main task framing unless the query itself is about device failure.
- For `composite-safety`, `behavior-sequence`, or `emergency-response`, use `secondary_task_profile=device-health` only when the case explicitly contains direct device-fault evidence such as fault events, repeated command retries, persistent no-response, or safety-device failure indicators.

Return JSON only:
{
"primary_task_profile": "device-health | single-event-safety | behavior-sequence | composite-safety | emergency-response",
"secondary_task_profile": "none | device-health | single-event-safety | behavior-sequence | composite-safety | emergency-response",
"query_anchor": {
"target_rooms": ["..."],
"target_devices": ["..."],
"target_question": "..."
},
"focus_rooms": ["..."],
"focus_devices": ["..."],
"focus_chunk_ids": ["C00", "C01"],
"suspected_patterns": ["..."],
"why_these_chunks": ["..."]
}"""

    return {"system": coordinator_role, "user": case_context + fixed_instructions}


def build_investigator_prompt(
    material: Dict,
    triage_text: str,
    focused_chunks: str,
    supervisor_text: str = "",
    round_index: int = 1,
) -> Dict[str, str]:
    """Assemble the system/user prompt pair for the investigator stage.

    Args:
        material: Case material. ``query`` and ``signals`` are read by
            subscript (a missing key raises ``KeyError``); ``signals`` is
            serialized with ``json.dumps``. The "Matter Notes" section comes
            from ``_protocol_notes_block(material)``.
        triage_text: Text placed under the "Triage Output" heading.
        focused_chunks: Text placed under the "Focused Chunks" heading.
        supervisor_text: Optional feedback. When truthy, a "Supervisor
            Feedback" section is inserted between the triage output and the
            focused chunks; otherwise an empty line takes its place.
        round_index: Number shown on the ``Round:`` line.

    Returns:
        A dict with the ``"system"`` and ``"user"`` prompt strings.
    """
    investigator_role = (
        "You are the investigator. Work only from the query, Matter notes, structured signals, and focused raw chunks. "
        "Construct competing normal and anomaly hypotheses with explicit evidence. "
        "Distinguish device faults from behavior or safety anomalies carefully."
    )

    # Optional section: rendered only when feedback from a previous round exists.
    feedback_section = (
        f"\n## Supervisor Feedback\n{supervisor_text}\n\n"
        "Apply the supervisor feedback explicitly in this round.\n"
        if supervisor_text
        else ""
    )

    # Case-specific sections; expressions are evaluated top to bottom.
    case_context = f"""## Query
{material['query']}

## Matter Notes
{_protocol_notes_block(material)}

## Structured Signals
{json.dumps(material['signals'], ensure_ascii=False, indent=2)}

## Triage Output
{triage_text}
{feedback_section}
## Focused Chunks
{focused_chunks}

Round: {round_index}

"""

    # Fixed reasoning rules (plain string, no interpolation).
    fixed_rules = """Rules:
- A raw value like `2466` for `TemperatureMeasurement.MeasuredValue` means `24.66 C`, not `2466 C`.
- Do not infer `sensor_malfunction` or `sensor_drift` from scaled temperature values alone.
- Device-fault hypotheses require direct evidence such as explicit alarm/fault events, repeated non-recovery, stuck values, impossible state transitions, or actuator commands failing to take effect.
- Behavior, intrusion, safety, and emergency hypotheses may be supported by coherent temporal patterns, cross-device inconsistencies, absence where an event should appear, or risky multi-step sequences even when no explicit fault code exists.
- If the query asks for abnormal behavior or safety risk, do not discard the anomaly path just because the system eventually recovered.
- If `primary_task_profile` is not `device-health`, treat telemetry issues (`None`, short dropouts, one isolated spike) as side evidence unless they directly explain the queried behavior/safety event.
- Keep the normal hypothesis competitive, but also keep at least one anomaly hypothesis whenever the logs contain a plausible risk pattern that still needs explanation.
- For `device-health`, eventual recovery does not erase an anomaly if the logs show repeated retries, alarm/fault events, contradictory state transitions, failed first attempts, or unstable actuator behavior.
- For `device-health`, an isolated suspicious reading that immediately returns to baseline is not enough for `sensor_malfunction` unless there is repetition, corroboration, failed recovery, or direct fault evidence.
- For lock/contact health checks, require explicit failed lock commands, contradictory lock/contact states, repeated retries, or persistent insecure state. Do not infer `lock_malfunction` only from delayed auto-lock, a long unlocked interval, or sparse mid-day access logs.
- A single transient `None`, one brief telemetry dropout, or the mere absence of logs for a device is not enough to claim `sensor_malfunction`, `safety_device_failure`, or a monitoring blind spot.
- For `sensor_malfunction` / `sensor_stuck` / `sensor_drift`, require persistence, repetition, failed recovery, or direct contradiction with other signals.
- For `unattended_cooking` / `fire_risk`, require not only heat/cook activity but also evidence of missing supervision, dangerous duration, failed mitigation, or hazardous escalation.
- When `primary_task_profile` is `composite-safety` or `emergency-response`, do not rely only on missing kitchen occupancy, the occupant being logged in another room, or the absence of a smoke alarm for `unattended_cooking`. Prefer a concrete unsupervised sequence such as:
- people leaving the hazard area with no return for a meaningful interval,
- vulnerable subject context with no mitigating check-back,
- or a hazardous device left active through a clear outcome window.
- When `primary_task_profile` is `composite-safety` or `emergency-response`, temperature rise or one telemetry gap is insufficient by itself for `fire_risk`. Prefer corroboration such as sustained heat growth, alarm-related evidence, failed mitigation, or a clearly unsafe appliance sequence.
- For `intrusion` / `tailgating` / `credential_theft`, occupancy alone is insufficient. Prefer corroboration from access-control inconsistencies, motion progression, lock/contact conflicts, or impossible entry timing.
- Do not explain away a suspected intrusion / credential-theft / child-safety / behavior anomaly as "normal cooking" or "telemetry noise" unless that explanation directly accounts for the queried abnormal sequence.
- For `child_safety`, `possible_fall`, and `abnormal_inactivity`, require subject-specific risky context, not just generic household quietness or sparse logs.
- For `composite-safety` or `emergency-response`, explicitly check whether your story is overly dependent on a single device or a single chunk. If yes, weaken the anomaly claim.
- For `composite-safety`, a local hazard can still justify an anomaly if all three are present:
- a clear hazard source or risky sequence,
- human vulnerability / missing supervision / dangerous context,
- and weak or delayed mitigation.
- For `composite-safety`, do not collapse to `none` merely because not every device category was observed, if a concrete hazardous sequence is already evidenced.
- For `composite-safety`, if a cook/heat/leak/fall hazard is suspected, make sure your evidence covers:
- the trigger chunk,
- one adjacent context chunk,
- and one mitigation/outcome chunk before concluding normal.

Return JSON only:
"""

    # Answer schema; an f-string because the allowed threat labels are
    # interpolated from THREAT_TYPES (literal braces are therefore doubled).
    answer_schema = f"""{{
"normal_hypotheses": [
{{"id": "N1", "description": "...", "evidence": ["..."], "weaknesses": ["..."]}}
],
"anomaly_hypotheses": [
{{"id": "A1", "description": "...", "threat_type": "{THREAT_TYPES}", "evidence": ["..."], "weaknesses": ["..."]}}
],
"most_discriminative_evidence": ["..."],
"missing_information": ["..."]
}}"""

    return {
        "system": investigator_role,
        "user": case_context + fixed_rules + answer_schema,
    }


def build_supervisor_prompt(
    material: Dict,
    triage_text: str,
    investigator_text: str,
    focused_chunks: str,
    round_index: int = 1,
) -> Dict[str, str]:
    """Assemble the system/user prompt pair for the supervisor stage.

    Args:
        material: Case material. ``query`` and ``chunk_index`` are read by
            subscript (a missing key raises ``KeyError``); each
            ``chunk_index`` entry must provide ``chunk_id``, which is listed
            under "Available Chunk IDs". The "Matter Notes" section comes
            from ``_protocol_notes_block(material)``.
        triage_text: Text placed under the "Triage Output" heading.
        investigator_text: Text placed under the "Investigator Output" heading.
        focused_chunks: Text placed under the "Focused Chunks" heading.
        round_index: Number shown on the ``Round:`` line.

    Returns:
        A dict with the ``"system"`` and ``"user"`` prompt strings.
    """
    supervisor_role = (
        "You are the supervisor. Check whether the current evidence collection is on-topic and sufficient. "
        "Flag protocol-format misunderstandings, false-alarm risk, and missing checks."
    )

    # Case-specific sections; expressions are evaluated top to bottom.
    case_context = f"""## Query
{material['query']}

## Matter Notes
{_protocol_notes_block(material)}

## Triage Output
{triage_text}

## Investigator Output
{investigator_text}

## Focused Chunks
{focused_chunks}

## Available Chunk IDs
{json.dumps([entry['chunk_id'] for entry in material['chunk_index']], ensure_ascii=False)}

Round: {round_index}

"""

    # Fixed review rules and answer schema. This is a plain (non-f) string, so
    # the braces of the JSON template are written literally.
    fixed_instructions = """Rules:
- If the investigator seems to misread Matter-scaled values as literal impossible temperatures, mark false-alarm risk high.
- If the analysis drifted away from the query target, mark `on_topic=false`.
- Distinguish two standards of evidence:
- device-fault labels need direct fault evidence;
- behavior/safety/emergency labels may rely on coherent temporal and cross-device evidence.
- If the query is about recent security events, abnormal behavior, or whole-home safety, and the investigation mostly debates telemetry quality, treat that as partial task drift.
- If the evidence is still ambiguous after this round, do not automatically convert that into a normal verdict.
- Use `recommended_action=refine_investigation` when another round could realistically help.
- Use `recommended_action=abstain` only when the current anomaly story is weak, underspecified, or mostly speculative after review.
- For `device-health`, eventual recovery does not automatically clear the case if the observed sequence already contains direct malfunction evidence such as repeated retries, contradictory states, or fault/alarm events.
- If a `device-health` conclusion is being made from one local chunk or one short nominal sequence only, prefer `refine_investigation` and request adjacent pre/post chunks.
- Mark false-alarm risk `high` if the anomaly story depends mainly on:
- one transient `None` or brief data dropout,
- missing logs from a device,
- one isolated suspicious reading without consequence,
- or a broad safety conclusion built from a single local device issue.
- Mark false-alarm risk `high` for `sensor_malfunction` if the story rests on one transient spike that immediately returns to baseline without repetition, alarms, or downstream consequences.
- Mark false-alarm risk `high` for `lock_malfunction` if the story rests mainly on delayed auto-lock, a long unlocked interval, or unobserved mid-gap access without an explicit failed lock attempt or contradictory final state.
- For `composite-safety` or `emergency-response`, mark false-alarm risk `high` for `unattended_cooking` / `fire_risk` if the story rests mainly on missing kitchen occupancy, another room showing occupancy, missing OFF logs near the truncation boundary, or one transient telemetry gap without hazardous escalation.
- For `composite-safety` and `emergency-response`, only mark `evidence_sufficient=true` if the analysis covers both the local trigger and the wider human/safety context.
- For `composite-safety`, if the current focused chunks miss the trigger chunk, the adjacent context chunk, or the mitigation/outcome chunk, request those chunks explicitly in `needs_more_chunks`.
- For `behavior-sequence` and `single-event-safety`, if the current story does not explain the access path / sequence logic directly, request the surrounding sequence chunks instead of debating device telemetry.
- If the conclusion is effectively "a device was not logged, therefore the home is unsafe", treat that as weak evidence unless there is corroboration.
- For `composite-safety`, if a clear hazardous sequence already includes a local trigger, vulnerable/unsupervised context, and missing or delayed mitigation, that local sequence may be sufficient even if some peripheral devices are not discussed.

Return JSON only:
{
"on_topic": true/false,
"evidence_sufficient": true/false,
"risk_of_false_alarm": "low | medium | high",
"recommended_action": "allow_final_verdict | refine_investigation | abstain",
"needs_more_chunks": ["C03", "C05"],
"missing_checks": ["..."],
"supervisor_notes": ["..."]
}"""

    return {"system": supervisor_role, "user": case_context + fixed_instructions}


def build_verifier_prompt(
    material: Dict,
    triage_text: str,
    investigator_text: str,
    supervisor_text: str,
    focused_chunks: str,
) -> Dict[str, str]:
    """Assemble the system/user prompt pair for the final verifier stage.

    Args:
        material: Case material. ``query`` is read by subscript (a missing
            key raises ``KeyError``). The "Matter Notes" section comes from
            ``_protocol_notes_block(material)``.
        triage_text: Text placed under the "Triage" heading.
        investigator_text: Text placed under the "Investigator" heading.
        supervisor_text: Text placed under the "Supervisor" heading.
        focused_chunks: Text placed under the "Focused Chunks" heading.

    Returns:
        A dict with the ``"system"`` and ``"user"`` prompt strings.
    """
    verifier_role = (
        "You are the final verifier. Make a precise final decision from the query, Matter notes, evidence, and competing hypotheses. "
        "You must obey the supervisor gate."
    )

    # Case-specific sections; expressions are evaluated top to bottom.
    case_context = f"""## Query
{material['query']}

## Matter Notes
{_protocol_notes_block(material)}

## Triage
{triage_text}

## Investigator
{investigator_text}

## Supervisor
{supervisor_text}

## Focused Chunks
{focused_chunks}

"""

    # Fixed decision rules (plain string, no interpolation).
    fixed_rules = """Rules:
- If the supervisor recommends `abstain`, you must return `is_anomaly=false`, `threat_type=none`, and `confidence=low`.
- Do not equate `evidence_sufficient=false` with `no anomaly`. It means the case may still require a cautious low/medium-confidence decision.
- Device-fault labels require direct device-fault evidence; scaled Matter temperature values alone are insufficient.
- Behavior, intrusion, safety, and emergency anomalies may be concluded from coherent temporal, causal, or cross-device evidence even when no explicit fault code exists.
- If the anomaly pattern is strong but type selection is uncertain, prefer the best-supported non-device-fault label over collapsing to `none`.
- Do not dismiss an anomaly just because the system later recovered, if the logged sequence itself reflects a real unsafe or abnormal event.
- If the investigator presents a plausible anomaly hypothesis and the supervisor does not abstain, you must explicitly refute that anomaly in your reasoning before returning `none`.
- If the query is non-`device-health` and the main anomaly story directly answers the query, prefer a low/medium-confidence anomaly over `none` when the remaining uncertainty is about telemetry quality rather than about the event sequence itself.
- For `composite-safety` or `emergency-response`, do not use that rule when the anomaly story itself is weakly supported or depends mainly on absence-based supervision assumptions, sparse lock activity, or one ambiguous telemetry inconsistency.
- Use `none` when the evidence supports a normal explanation better than an anomaly explanation.
- For `device-health`, repeated retries, contradictory lock/contact transitions, explicit alarm events, or unstable actuator sequences can justify anomaly even if the device later recovers.
- Never escalate to `sensor_malfunction`, `safety_device_failure`, or `sensor_stuck` from a single transient dropout or from missing logs alone.
- Never escalate to `sensor_malfunction` from one isolated spike that immediately returns to baseline unless you also have repetition, corroborating device conflict, failed recovery, or explicit fault evidence.
- Never escalate to `lock_malfunction` from delayed auto-lock, a long unlocked interval, or sparse access activity alone. Require failed lock commands, repeated retries, contradictory lock/contact states, or persistent insecure outcome.
- For `device-health`, if the inspected evidence does not cover both the suspicious event and a later confirming outcome, lower confidence and prefer additional review over a strong final verdict.
- For `single-event-safety`, `behavior-sequence`, and `composite-safety`, do not return `none` solely because some telemetry is incomplete if the human/device sequence already supports a concrete anomaly better than a normal routine.
- For `composite-safety` or `emergency-response`, do return `none` when the anomaly depends mainly on missing supervision assumptions, sparse access logs, or a plausible ordinary routine that explains the same sequence without contradiction.
- For `composite-safety` and `emergency-response`, check scope before finalizing:
- Did you assess the broader human/safety context?
- Or did you overfit to one device/chunk?
If you overfit to one local signal, lower confidence or reject the anomaly claim.
- For `unattended_cooking` / `fire_risk`, require a hazardous sequence, not merely cooking plus incomplete telemetry.
- For `intrusion` / `tailgating`, require access-path evidence, not just unusual occupancy or periodic motion.
- For `composite-safety`, if the logs show a concrete hazardous sequence with a trigger, vulnerable or unsupervised context, and delayed/weak mitigation, you may conclude anomaly even if the full-home picture is incomplete.

Return JSON only:
"""

    # Answer schema; an f-string because the allowed threat labels are
    # interpolated from THREAT_TYPES (literal braces are therefore doubled).
    answer_schema = f"""{{
"is_anomaly": true/false,
"confidence": "high/medium/low",
"threat_type": "{THREAT_TYPES}",
"threat_description": "one-sentence conclusion",
"reasoning": ["step 1", "step 2", "step 3"],
"key_evidence": ["evidence 1", "evidence 2"],
"recommended_actions": ["action 1", "action 2"]
}}"""

    return {
        "system": verifier_role,
        "user": case_context + fixed_rules + answer_schema,
    }