import json

def generate_manifest(
    input_json="pre_annotations_sentences.json",
    output_manifest="trimming_manifest.jsonl",
    url_prefix="http://localhost:8081/",
    path_root="/root/",
):
    """Write a JSONL trimming manifest from a JSON list of annotation tasks.

    Each task must have a ``"data"`` dict.  From the first entry of the task's
    ``"predictions"`` list, the last result whose ``to_name`` is ``"audio"``
    supplies the sentence start/end (relative to the trimmed clip) and the
    last result whose ``to_name`` is ``"full_text"`` supplies the sentence
    text.  Tasks with missing, ``null`` or empty predictions still produce an
    entry, with sentence offsets of 0 and an empty sentence.

    One JSON object is written per task with the keys
    ``trimmed_audio_filepath``, ``trim_start_sec``, ``trim_end_sec``,
    ``trim_duration_sec``, ``original_sentence_start_sec``,
    ``original_sentence_end_sec``, ``current_sentence`` and ``full_text``.
    All ``*_sec`` values are rounded to two decimals.

    Args:
        input_json: Path of the JSON file holding the list of tasks.
        output_manifest: Path of the JSONL file to create (overwritten).
        url_prefix: Leading part of ``data["audio"]`` that is swapped for
            ``path_root`` to obtain a path on disk.
        path_root: Directory prefix that replaces ``url_prefix``.

    Raises:
        KeyError: If a task has no ``"data"`` key, or a matched region lacks
            ``value`` / ``start`` / ``end`` / ``text``.
    """
    # Imported once per call instead of once per task inside the loop.
    from urllib.parse import unquote

    with open(input_json, "r", encoding="utf-8") as f:
        tasks = json.load(f)

    with open(output_manifest, "w", encoding="utf-8") as out:
        for task in tasks:
            data = task["data"]

            # `or` (rather than a .get default) also covers an explicit
            # empty list or null, which would otherwise fail on `[0]`.
            predictions = task.get("predictions") or [{}]
            results = predictions[0].get("result") or []

            # Later matches overwrite earlier ones, so the last region of
            # each kind wins.
            audio_region = None
            text_region = None
            for result in results:
                target = result.get("to_name")
                if target == "audio":
                    audio_region = result
                elif target == "full_text":
                    text_region = result

            trim_start = data.get("original_trim_start", 0)
            trim_end = data.get("original_trim_end", 0)

            # Region times are relative to the trimmed clip; adding the trim
            # start maps them back onto the original recording's timeline.
            if audio_region is not None:
                sentence_start_in_trimmed = audio_region["value"]["start"]
                sentence_end_in_trimmed = audio_region["value"]["end"]
            else:
                sentence_start_in_trimmed = 0
                sentence_end_in_trimmed = 0
            original_sentence_start = trim_start + sentence_start_in_trimmed
            original_sentence_end = trim_start + sentence_end_in_trimmed

            current_sentence = text_region["value"]["text"] if text_region is not None else ""

            # Convert the served URL back to a file path: swap only a leading
            # url_prefix (not occurrences elsewhere in the string), then
            # undo percent-encoding.
            audio_url = data.get("audio", "")
            if audio_url.startswith(url_prefix):
                audio_path = path_root + audio_url[len(url_prefix):]
            else:
                audio_path = audio_url
            audio_path = unquote(audio_path)

            entry = {
                "trimmed_audio_filepath": audio_path,
                "trim_start_sec": round(trim_start, 2),
                "trim_end_sec": round(trim_end, 2),
                "trim_duration_sec": round(trim_end - trim_start, 2),
                "original_sentence_start_sec": round(original_sentence_start, 2),
                "original_sentence_end_sec": round(original_sentence_end, 2),
                "current_sentence": current_sentence,
                "full_text": data.get("full_text", ""),
            }

            # ensure_ascii=False keeps non-ASCII text readable in the output.
            out.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"Generated {output_manifest} with {len(tasks)} entries")

# Script entry point: when run directly (not imported), build the manifest
# using generate_manifest's default input and output paths.
if __name__ == "__main__":
    generate_manifest()
