| import re |
| import os |
| import dataclasses |
| from enum import auto, Enum |
| from typing import List, Tuple |
| from collections import defaultdict |
| from .constants import PART_ORDER, COCO_KEYPOINT_NAME |
|
|
def read_hoi_file_2_dict(hoi_config):
    """Load an HOI list file into ``{hoi_id: [object, action]}``.

    Blank lines and lines beginning with ``#`` are ignored. Every other
    line must consist of exactly three whitespace-separated tokens
    (``<id> <object> <action>``); the id is converted with ``int``.

    Args:
        hoi_config: Path of the UTF-8 text file to read.

    Returns:
        A dict mapping the integer id to the two-element list
        ``[object, action]``.

    Raises:
        ValueError: If a data line does not have exactly three tokens or
            its first token is not an integer.
    """
    id_to_pair = {}
    with open(hoi_config, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.strip()
            # Skip separators and comment lines.
            if entry == "" or entry[0] == "#":
                continue
            index, obj_name, verb = entry.split()
            id_to_pair[int(index)] = [obj_name, verb]
    return id_to_pair
|
|
def read_part_state_file_2_dict(part_state_config):
    """Group the ``part: state`` lines of a config file by part.

    Blank lines and lines beginning with ``#`` are ignored. Each remaining
    line is split on its first ``:``; both halves are stripped and the
    state is appended to the list kept for that part, preserving file
    order.

    Args:
        part_state_config: Path of the UTF-8 text file to read.

    Returns:
        A ``defaultdict(list)`` mapping each part key to its states in
        the order they appear in the file.

    Raises:
        ValueError: If a data line contains no ``:``.
    """
    states_by_part = defaultdict(list)
    with open(part_state_config, "r", encoding="utf-8") as handle:
        for entry in map(str.strip, handle):
            if not entry or entry.startswith("#"):
                continue
            # Only the first colon separates part from state.
            part, state = (piece.strip() for piece in entry.split(":", 1))
            states_by_part[part].append(state)
    return states_by_part
|
|
@dataclasses.dataclass
class Conversation:
    """Prompt builder for the first-pass long-form HICO description.

    An instance carries the system prompt and two lookup tables read from
    ``<data_path>/Configs``: ``hoi_reference`` (HOI id -> ``[object,
    action]``) and ``part_state_reference`` (part key -> list of states).

    NOTE(review): the class declares no dataclass fields and defines its
    own ``__init__``; the decorator is kept only so existing behaviour is
    not altered.
    """

    def __init__(self, system='', data_path=''):
        """Set the system prompt and load both reference tables.

        Args:
            system: System prompt to use; the built-in default is used
                when this is the empty string.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system == '':
            self.system = f"""
            You are an AI assistant for first-pass long-form HICO annotation. You will be given an image that contains a main human subject.
            Task:
            Write a detailed long description of the visual evidence in the image that supports the subject's action, with an emphasis on human body parts, posture, spatial configuration, and interactions with objects.

            Hints:
            You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You should use these hints as anchors, and you may add other relevant visible evidence you observe.

            Required Constraints:
            - Start with ONE sentence that summarizes the main action in natural language.
            - When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
            - Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
            - If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
            - The description must be long and detailed enough to serve as a first-pass annotation for later refinement.
            - Include as many relevant supporting details as are visibly justified, especially about contact, pose, orientation, support, and object interaction.
            - Write your description in clear, natural sentences grounded in visible evidence.

            Optional Constraints :
            - Prefer a rich multi-sentence paragraph rather than a short caption.
            - Cover multiple cues when available, such as limb placement, body balance, joint bending, contact points, and relative position to the object.
            - Write naturally. Avoid repeating the same sentence pattern.
            - If both sides contribute differently, describe them separately.
            """
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _humanpart2word(self, action_labels):
        """Translate part-state label ids into ``[part_name, part_state]`` pairs.

        Args:
            action_labels: Iterable of mappings with a ``'human_part'``
                index into ``PART_ORDER`` and a ``'partstate'`` index into
                the state list of the matching part.

        Returns:
            One ``[part_name, part_state]`` list per label whose part name
            contains a key of ``part_state_reference``. Labels whose part
            matches no key are skipped, so a state is never carried over
            from a previous label.
        """
        labels_in_words = []
        for label in action_labels:
            human_part_id = label['human_part']
            part_state_id = label['partstate']

            part_name = PART_ORDER[human_part_id]
            # A table key applies when it occurs inside the part name; if
            # several keys apply, the last one in table order is used.
            matching_keys = [key for key in self.part_state_reference if key in part_name]
            if not matching_keys:
                continue
            part_state = self.part_state_reference[matching_keys[-1]][part_state_id]
            labels_in_words.append([part_name, part_state])
        return labels_in_words

    def _actionid2word(self, hoi_id):
        """Return the ``(object, action)`` words stored for ``hoi_id``."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Build the user prompt for one sample.

        Args:
            meta: Mapping whose ``'hoi_obj'`` entry holds ``'hoi_id'`` and
                ``'action_labels'``.

        Returns:
            The prompt text naming the action, the object and the
            part-state cues.
        """
        hoi_obj = meta['hoi_obj']

        obj_in_word, act_in_word = self._actionid2word(hoi_obj['hoi_id'])
        action_labels_in_words = self._humanpart2word(hoi_obj['action_labels'])

        prompt = f"""
        Given the image, describe the visual evidence (especially body parts) that supports the action.
        Hints: The action to support is [{act_in_word} with {obj_in_word}]. Possible visual evidence cues include: {action_labels_in_words}.
        Use these cues as guidance. Only mention cues you can actually see in the image.
        """
        return prompt
| |
@dataclasses.dataclass
class Conversation_For_Clean_Descrption:
    """Prompt builder that asks for a verified 2–3 sentence rewrite.

    An instance carries the system prompt and two lookup tables read from
    ``<data_path>/Configs``: ``hoi_reference`` (HOI id -> ``[object,
    action]``) and ``part_state_reference`` (part key -> list of states).

    NOTE(review): the class declares no dataclass fields and defines its
    own ``__init__``; the decorator is kept only so existing behaviour is
    not altered.
    """

    # Deliberately un-annotated: an annotation would turn these class
    # constants into dataclass fields.
    _PART_REPLACEMENTS = {
        "hand": "wrist",
        "hands": "wrists",
        "foot": "ankle",
        "feet": "ankles",
    }
    # ``\b`` counts "_" as a word character and would therefore miss
    # snake_case names such as "right_hand"; these lookarounds only refuse
    # a neighbouring letter or digit.
    _PART_PATTERN = re.compile(
        r"(?<![^\W_])(" + "|".join(map(re.escape, _PART_REPLACEMENTS)) + r")(?![^\W_])",
        re.IGNORECASE,
    )

    def __init__(self, system='', data_path=''):
        """Set the system prompt and load both reference tables.

        Args:
            system: System prompt to use; the built-in default is used
                when this is the empty string.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system == '':
            self.system = """
            You are a strict verifier and editor for pose-grounded action descriptions.

            You will be given:
            - Ground-truth action label(s) (GT).
            - A candidate description (may be verbose or include irrelevant evidence).
            - A closed list of allowed keypoint/body-part names.
            - A replacement mapping (e.g., hand→wrist, foot→ankle).

            Rules:
            1) First, check whether the candidate’s stated action matches the GT action(s).
            2) Then rewrite the description into exactly 2–3 sentences:
            - The first sentence must state the GT action (not the candidate action if it differs).
            - Keep only evidence that supports the GT action; delete unrelated evidence.
            - If a joint is mentioned both sides ALWAYS write as "left_wrist and right_wrist", "left_hip and right_hip", "left_ankle and right_ankle", etc.
            - When mentioning body parts/keypoints, you MUST use only names from the allowed list (exact match).
            - Apply the replacement mapping strictly; never output disallowed synonyms like “hand/foot” if they map to allowed names.
            - Do not add new evidence; only keep/condense evidence already present in the candidate.
            - A MUST-KEEP hint: required (joint, part_action) items that must appear in the final description (joint names may need replacement).

            Output format (plain text only): The refined 2–3 sentence description.
            No other text.
            """
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Rewrite hand/hands/foot/feet as wrist/wrists/ankle/ankles.

        Matching is case-insensitive and applies to stand-alone words as
        well as to the segments of snake_case names (``left_hand`` ->
        ``left_wrist``). A replacement is capitalised when the matched
        word starts with an upper-case letter.
        """
        def _swap(match):
            found = match.group(0)
            replacement = self._PART_REPLACEMENTS[found.lower()]
            return replacement.capitalize() if found[0].isupper() else replacement

        return self._PART_PATTERN.sub(_swap, text)

    def _humanpart2word(self, action_labels):
        """Translate part-state label ids into ``[part_name, part_state]`` pairs.

        Args:
            action_labels: Iterable of mappings with a ``'human_part'``
                index into ``PART_ORDER`` and a ``'partstate'`` index into
                the state list of the matching part.

        Returns:
            One ``[part_name, part_state]`` list per label whose part name
            contains a key of ``part_state_reference``; the part name is
            passed through ``_replace_part_names``. Labels whose part
            matches no key are skipped, so a state is never carried over
            from a previous label.
        """
        labels_in_words = []
        for label in action_labels:
            human_part_id = label['human_part']
            part_state_id = label['partstate']

            part_name = PART_ORDER[human_part_id]
            # Keys are matched against the raw part name, before renaming;
            # if several keys apply, the last one in table order is used.
            matching_keys = [key for key in self.part_state_reference if key in part_name]
            if not matching_keys:
                continue
            part_state = self.part_state_reference[matching_keys[-1]][part_state_id]
            labels_in_words.append([self._replace_part_names(part_name), part_state])
        return labels_in_words

    def _actionid2word(self, hoi_id):
        """Return the ``(object, action)`` words stored for ``hoi_id``."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Build the user prompt for one sample.

        Args:
            meta: Mapping with ``'hoi_id'``, ``'action_labels'`` and the
                candidate text under ``'description'``.

        Returns:
            The prompt text containing the GT action, the allowed keypoint
            names, the candidate description and the must-keep hints.
        """
        obj_in_word, act_in_word = self._actionid2word(meta['hoi_id'])
        action_labels_in_words = self._humanpart2word(meta['action_labels'])

        description = self._replace_part_names(meta['description'])

        prompt = f"""
        GT action(s): {act_in_word, obj_in_word}
        Allowed keypoint names:
        {COCO_KEYPOINT_NAME}
        Replacement mapping:
        "hand" to "wrist", "foot" to "ankle"
        Candidate description:
        {description}
        Must-KEEP Hint:
        {action_labels_in_words}
        Please follow the system rules and output in the required plain-text format.
        """
        return prompt
|
|
@dataclasses.dataclass
class Conversation_For_Clean_Evidence:
    """Prompt builder that asks for an evidence-only rewrite.

    An instance carries the system prompt and two lookup tables read from
    ``<data_path>/Configs``: ``hoi_reference`` (HOI id -> ``[object,
    action]``) and ``part_state_reference`` (part key -> list of states).

    NOTE(review): the class declares no dataclass fields and defines its
    own ``__init__``; the decorator is kept only so existing behaviour is
    not altered.
    """

    # Deliberately un-annotated: an annotation would turn these class
    # constants into dataclass fields.
    _PART_REPLACEMENTS = {
        "hand": "wrist",
        "hands": "wrists",
        "foot": "ankle",
        "feet": "ankles",
    }
    # ``\b`` counts "_" as a word character and would therefore miss
    # snake_case names such as "right_hand"; these lookarounds only refuse
    # a neighbouring letter or digit.
    _PART_PATTERN = re.compile(
        r"(?<![^\W_])(" + "|".join(map(re.escape, _PART_REPLACEMENTS)) + r")(?![^\W_])",
        re.IGNORECASE,
    )

    def __init__(self, system='', data_path=''):
        """Set the system prompt and load both reference tables.

        Args:
            system: System prompt to use; the built-in default is used
                when this is the empty string.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system == '':
            self.system = """
            You rewrite descriptions into NATURAL LANGUAGE evidence-only text.

            Output rules:
            - Write 2–3 complete sentences in natural English.
            - Do NOT mention the action or the subject (no "person", "he", "she", "they", "main", etc.).
            - Only describe evidence involving body parts/keypoints and part-level motions/contacts.
            - Every sentence must include at least one keypoint name from the allowed list (exact match).
            - Only use keypoint names from the allowed list; no other body-part words.
            - Never use generic joints (e.g., "wrist", "hip", "ankle") alone; If both sides are mentioned, use "left_wrist and right_wrist", "left_hip and right_hip", etc.
            - Apply the replacement mapping first (hand→wrist, foot→ankle, etc.), then enforce left/right by writing both sides.
            - Keep only evidence supported by the candidate; do not add new details.

            Style variety requirement:
            - Write like a reasoning use normal grammar, not lists, not "keypoint: ...".
            - Do not use the same starter phrase or the same connector in both sentences.
            - Example reasoning patterns (Can invent your own, but use different pattern):
            A) “With <keypoints/evidence>, <interpretation>.” (no “suggesting/indicating”)
            B) “<Interpretation>; evidence includes <keypoints/evidence>.” (semicolon style)
            C) “This is supported by <keypoints/evidence>, which <effect/constraint>.” (“supported by” style)
            D) “Notably, <keypoints/evidence>; this points to <interpretation>.” (“notably/points to” style)
            E) “<Keypoints/evidence> form(s) <configuration>, consistent with <interpretation>.” (“configuration” style)
            """
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Rewrite hand/hands/foot/feet as wrist/wrists/ankle/ankles.

        Matching is case-insensitive and applies to stand-alone words as
        well as to the segments of snake_case names (``left_hand`` ->
        ``left_wrist``). A replacement is capitalised when the matched
        word starts with an upper-case letter.
        """
        def _swap(match):
            found = match.group(0)
            replacement = self._PART_REPLACEMENTS[found.lower()]
            return replacement.capitalize() if found[0].isupper() else replacement

        return self._PART_PATTERN.sub(_swap, text)

    def _humanpart2word(self, action_labels):
        """Translate part-state label ids into ``[part_name, part_state]`` pairs.

        Args:
            action_labels: Iterable of mappings with a ``'human_part'``
                index into ``PART_ORDER`` and a ``'partstate'`` index into
                the state list of the matching part.

        Returns:
            One ``[part_name, part_state]`` list per label whose part name
            contains a key of ``part_state_reference``; the part name is
            passed through ``_replace_part_names``. Labels whose part
            matches no key are skipped, so a state is never carried over
            from a previous label.
        """
        labels_in_words = []
        for label in action_labels:
            human_part_id = label['human_part']
            part_state_id = label['partstate']

            part_name = PART_ORDER[human_part_id]
            # Keys are matched against the raw part name, before renaming;
            # if several keys apply, the last one in table order is used.
            matching_keys = [key for key in self.part_state_reference if key in part_name]
            if not matching_keys:
                continue
            part_state = self.part_state_reference[matching_keys[-1]][part_state_id]
            labels_in_words.append([self._replace_part_names(part_name), part_state])
        return labels_in_words

    def _actionid2word(self, hoi_id):
        """Return the ``(object, action)`` words stored for ``hoi_id``."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Build the user prompt for one sample.

        Args:
            meta: Mapping with ``'hoi_id'``, ``'action_labels'`` and the
                candidate text under ``'short_description'``.

        Returns:
            The prompt text containing the GT action, the allowed keypoint
            names, the candidate description and the must-keep hints.
        """
        obj_in_word, act_in_word = self._actionid2word(meta['hoi_id'])
        action_labels_in_words = self._humanpart2word(meta['action_labels'])

        description = self._replace_part_names(meta['short_description'])
        prompt = f"""
        GT action(s): {act_in_word, obj_in_word}
        Allowed keypoint names:
        {COCO_KEYPOINT_NAME}
        Replacement mapping:
        "hand" to "wrist", "foot" to "ankle"
        Candidate description:
        {description}
        Must-KEEP Hint:
        {action_labels_in_words}
        Please follow the system rules and output in the required plain-text format.
        """
        return prompt
|
|
@dataclasses.dataclass
class Conversation_examiner:
    """Prompt builder for the final checking/editing pass.

    An instance carries the system prompt and two lookup tables read from
    ``<data_path>/Configs``: ``hoi_reference`` (HOI id -> ``[object,
    action]``) and ``part_state_reference`` (part key -> list of states).

    NOTE(review): the class declares no dataclass fields and defines its
    own ``__init__``; the decorator is kept only so existing behaviour is
    not altered.
    """

    # Deliberately un-annotated: an annotation would turn these class
    # constants into dataclass fields.
    _PART_REPLACEMENTS = {
        "hand": "wrist",
        "hands": "wrists",
        "foot": "ankle",
        "feet": "ankles",
    }
    # ``\b`` counts "_" as a word character and would therefore miss
    # snake_case names such as "right_hand"; these lookarounds only refuse
    # a neighbouring letter or digit.
    _PART_PATTERN = re.compile(
        r"(?<![^\W_])(" + "|".join(map(re.escape, _PART_REPLACEMENTS)) + r")(?![^\W_])",
        re.IGNORECASE,
    )

    def __init__(self, system='', data_path=''):
        """Set the system prompt and load both reference tables.

        Args:
            system: System prompt to use; the built-in default is used
                when this is the empty string.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system == '':
            self.system = f"""
            You are a strict checker and final editor for HICO action annotations.

            You will be given:
            - The ground-truth HICO action hint as [VERB, OBJECT].
            - Part-state hints derived from annotation labels.
            - One or more candidate texts, such as a long description, a short description, or an evidence-only description.

            Your task:
            - Judge whether the candidate texts are consistent with the target action.
            - Check whether the descriptions are grounded in plausible visible body-part evidence.
            - Check whether any mentioned body parts use valid COCO keypoint names only: {COCO_KEYPOINT_NAME}.
            - Detect unsupported claims, contradictions, object/action mismatches, left/right mistakes, and hallucinated joints or interactions.
            - Produce a final checked description after resolving any issues you can fix from the provided candidates and hints.

            Important checking rules:
            - The target action is defined by the provided HICO hint, not by the candidate text.
            - If a candidate text conflicts with the target action, fix the final checked description so it aligns with the target action.
            - If a candidate text includes body-part terms outside the allowed keypoint list, replace them with valid names when possible and record the issue.
            - If evidence is too vague, missing, or unrelated to the target action, remove unsupported content from the final checked description and record the issue.
            - Pay special attention to left/right consistency. If the candidate confuses left and right, or assigns evidence to the wrong side, correct it when the correct side is supported by the provided candidates and hints; otherwise remove the uncertain side-specific claim and record the issue.
            - Do not keep any joint claim that is not visible, not inferable from the provided evidence, or appears hallucinated. If a joint or body-part interaction cannot be supported, remove it and record the issue.
            - Do not invent new visual evidence that is not supported by the provided candidates and hints.
            - The final checked description should be concise, natural, and reliable.
            - Prefer the strongest grounded evidence among the provided candidates.
            - When side-specific evidence is uncertain, prefer a conservative description over a risky one.

            Output format:
            Return plain text in exactly this structure.

            Verdict: PASS or REVISED
            Action alignment: one short sentence
            Evidence grounding: one short sentence
            Keypoint-name validity: one short sentence
            Checked description:
            <final checked description>
            Issues:
            - item 1
            - item 2

            If there are no issues, write:
            Issues:
            - None
            """
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Rewrite hand/hands/foot/feet as wrist/wrists/ankle/ankles.

        Matching is case-insensitive and applies to stand-alone words as
        well as to the segments of snake_case names (``left_hand`` ->
        ``left_wrist``). A replacement is capitalised when the matched
        word starts with an upper-case letter.
        """
        def _swap(match):
            found = match.group(0)
            replacement = self._PART_REPLACEMENTS[found.lower()]
            return replacement.capitalize() if found[0].isupper() else replacement

        return self._PART_PATTERN.sub(_swap, text)

    def _humanpart2word(self, action_labels):
        """Translate part-state label ids into ``[part_name, part_state]`` pairs.

        Args:
            action_labels: Iterable of mappings with a ``'human_part'``
                index into ``PART_ORDER`` and a ``'partstate'`` index into
                the state list of the matching part.

        Returns:
            One ``[part_name, part_state]`` list per label whose part name
            contains a key of ``part_state_reference``; the part name is
            passed through ``_replace_part_names``. Labels whose part
            matches no key are skipped, so a state is never carried over
            from a previous label.
        """
        labels_in_words = []
        for label in action_labels:
            human_part_id = label['human_part']
            part_state_id = label['partstate']

            part_name = PART_ORDER[human_part_id]
            # Keys are matched against the raw part name, before renaming;
            # if several keys apply, the last one in table order is used.
            matching_keys = [key for key in self.part_state_reference if key in part_name]
            if not matching_keys:
                continue
            part_state = self.part_state_reference[matching_keys[-1]][part_state_id]
            labels_in_words.append([self._replace_part_names(part_name), part_state])
        return labels_in_words

    def _actionid2word(self, hoi_id):
        """Return the ``(object, action)`` words stored for ``hoi_id``."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Build the user prompt listing every candidate text for one sample.

        Args:
            meta: Mapping with ``'hoi_id'`` and ``'action_labels'``, plus
                any of the optional candidate texts ``'description'``,
                ``'refined_description'``, ``'short_description'``,
                ``'action_description'`` and ``'evidence_description'``.

        Returns:
            The prompt text. A candidate that is absent, ``None`` or empty
            is shown as ``[Missing]``.
        """
        obj_in_word, act_in_word = self._actionid2word(meta['hoi_id'])
        action_labels_in_words = self._humanpart2word(meta['action_labels'])

        def _candidate(field):
            # ``or ''`` also covers a key stored with the value None, which
            # the regex substitution cannot accept.
            return self._replace_part_names(meta.get(field) or '') or '[Missing]'

        long_description = _candidate('description')
        refined_description = _candidate('refined_description')
        short_description = _candidate('short_description')
        action_description = _candidate('action_description')
        evidence_description = _candidate('evidence_description')

        prompt = f"""
        Target action hint: [{act_in_word}, {obj_in_word}]
        Part-state hints:
        {action_labels_in_words}

        Candidate long description:
        {long_description}

        Candidate refined description:
        {refined_description}

        Candidate short description:
        {short_description}

        Candidate action description:
        {action_description}

        Candidate evidence description:
        {evidence_description}

        Check the candidates against the target action and part-state hints, produce the final checked description, and then follow the required output format exactly.
        """
        return prompt
|
|
@dataclasses.dataclass
class Conversation_For_Action_Pharse:
    """Prompt builder that asks for a one-sentence action caption.

    An instance carries the system prompt and two lookup tables read from
    ``<data_path>/Configs``: ``hoi_reference`` (HOI id -> ``[object,
    action]``) and ``part_state_reference`` (part key -> list of states).

    NOTE(review): the class declares no dataclass fields and defines its
    own ``__init__``; the decorator is kept only so existing behaviour is
    not altered.
    """

    # Deliberately un-annotated: an annotation would turn these class
    # constants into dataclass fields.
    _PART_REPLACEMENTS = {
        "hand": "wrist",
        "hands": "wrists",
        "foot": "ankle",
        "feet": "ankles",
    }
    # ``\b`` counts "_" as a word character and would therefore miss
    # snake_case names such as "right_hand"; these lookarounds only refuse
    # a neighbouring letter or digit.
    _PART_PATTERN = re.compile(
        r"(?<![^\W_])(" + "|".join(map(re.escape, _PART_REPLACEMENTS)) + r")(?![^\W_])",
        re.IGNORECASE,
    )

    def __init__(self, system='', data_path=''):
        """Set the system prompt and load both reference tables.

        Args:
            system: System prompt to use; the built-in default is used
                when this is the empty string.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system == '':
            self.system = """
            You are a visual captioning assistant.
            Given an image and an action hint in the form [VERB, OBJECT], output exactly one short English sentence describing that action in the image.

            Rules:
            • Use only the provided VERB and OBJECT (you may adjust grammar: holds/holding; a/the; plural if needed).
            • Output one sentence only.
            • No extra details (no location, colors, emotions, reasons, scene context).
            • No punctuation beyond the final period.
            • If the subject is a person, use “The person” (not “man/woman/boy/girl”).
            • If the action is not visible, still output a best-effort sentence using the hint.
            """
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Rewrite hand/hands/foot/feet as wrist/wrists/ankle/ankles.

        Matching is case-insensitive and applies to stand-alone words as
        well as to the segments of snake_case names (``left_hand`` ->
        ``left_wrist``). A replacement is capitalised when the matched
        word starts with an upper-case letter.
        """
        def _swap(match):
            found = match.group(0)
            replacement = self._PART_REPLACEMENTS[found.lower()]
            return replacement.capitalize() if found[0].isupper() else replacement

        return self._PART_PATTERN.sub(_swap, text)

    def _humanpart2word(self, action_labels):
        """Translate part-state label ids into ``[part_name, part_state]`` pairs.

        Args:
            action_labels: Iterable of mappings with a ``'human_part'``
                index into ``PART_ORDER`` and a ``'partstate'`` index into
                the state list of the matching part.

        Returns:
            One ``[part_name, part_state]`` list per label whose part name
            contains a key of ``part_state_reference``; the part name is
            passed through ``_replace_part_names``. Labels whose part
            matches no key are skipped, so a state is never carried over
            from a previous label.
        """
        labels_in_words = []
        for label in action_labels:
            human_part_id = label['human_part']
            part_state_id = label['partstate']

            part_name = PART_ORDER[human_part_id]
            # Keys are matched against the raw part name, before renaming;
            # if several keys apply, the last one in table order is used.
            matching_keys = [key for key in self.part_state_reference if key in part_name]
            if not matching_keys:
                continue
            part_state = self.part_state_reference[matching_keys[-1]][part_state_id]
            labels_in_words.append([self._replace_part_names(part_name), part_state])
        return labels_in_words

    def _actionid2word(self, hoi_id):
        """Return the ``(object, action)`` words stored for ``hoi_id``."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Build the user prompt carrying the action hint.

        Only ``meta['hoi_id']`` is read: the prompt contains nothing but
        the action/object pair, so part-state labels and candidate
        descriptions are not required to be present.

        Args:
            meta: Mapping with ``'hoi_id'``.

        Returns:
            The prompt text with the ``(action, object)`` pair as hint.
        """
        obj_in_word, act_in_word = self._actionid2word(meta['hoi_id'])
        prompt = f"""
        Hints: {act_in_word, obj_in_word}
        Write exactly one short sentence that follows the rules.
        """
        return prompt
| |
@dataclasses.dataclass
class Conversation_For_COCO_Long_Description:
    """Prompt builder for a free-form description hinted by the state table.

    An instance carries the system prompt and two lookup tables read from
    ``<data_path>/Configs``: ``hoi_reference`` (HOI id -> ``[object,
    action]``) and ``part_state_reference`` (part key -> list of states).

    NOTE(review): the class declares no dataclass fields and defines its
    own ``__init__``; the decorator is kept only so existing behaviour is
    not altered.
    """

    # Deliberately un-annotated: an annotation would turn these class
    # constants into dataclass fields.
    _PART_REPLACEMENTS = {
        "hand": "wrist",
        "hands": "wrists",
        "foot": "ankle",
        "feet": "ankles",
    }
    # ``\b`` counts "_" as a word character and would therefore miss
    # snake_case names such as "right_hand"; these lookarounds only refuse
    # a neighbouring letter or digit.
    _PART_PATTERN = re.compile(
        r"(?<![^\W_])(" + "|".join(map(re.escape, _PART_REPLACEMENTS)) + r")(?![^\W_])",
        re.IGNORECASE,
    )

    def __init__(self, system='', data_path=''):
        """Set the system prompt and load both reference tables.

        Args:
            system: System prompt to use; the built-in default is used
                when this is the empty string.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system == '':
            self.system = f"""
            You are an AI assistant. You will be given an image that contains a main human subject.
            Task:
            Describe the visual evidence in the image that supports the subject’s action, with an emphasis on human body parts and their interactions with objects.

            Hints:
            You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You can use these hints, but you may also add other relevant evidence you observe.

            Required Constraints:
            - Start with ONE sentence that summarizes the main action in natural language.
            - When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
            - Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
            - If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
            - Write your description in clear, concise sentences grounded in visible evidence.

            Optional Constraints :
            - Write naturally. Avoid repeating the same sentence pattern.
            - Keep each evidence item to one line. No redundant "both left/right do the same" unless necessary.
            """
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Rewrite hand/hands/foot/feet as wrist/wrists/ankle/ankles.

        Matching is case-insensitive and applies to stand-alone words as
        well as to the segments of snake_case names (``left_hand`` ->
        ``left_wrist``). A replacement is capitalised when the matched
        word starts with an upper-case letter.
        """
        def _swap(match):
            found = match.group(0)
            replacement = self._PART_REPLACEMENTS[found.lower()]
            return replacement.capitalize() if found[0].isupper() else replacement

        return self._PART_PATTERN.sub(_swap, text)

    def get_prompt(self, meta):
        """Build the user prompt that embeds the whole part-state table.

        Args:
            meta: Accepted for interface parity with the other prompt
                builders; it is not read.

        Returns:
            The prompt text with the part-state table rendered as a plain
            dict literal.
        """
        # Copy into a plain dict so the prompt shows ``{...}`` instead of
        # the ``defaultdict(<class 'list'>, {...})`` wrapper.
        part_states = dict(self.part_state_reference)
        prompt = f"""
        Hint: you may consider use the actions in the below dictionary {part_states}
        Given the image, describe the visual evidence (especially body parts) that supports the action.
        """
        return prompt
|
|
|
|
# Script entry point: intentionally a no-op, so running this module directly
# performs no work beyond defining the functions and classes above.
if __name__ == "__main__":
    pass
|
|