import json
import uuid
from collections import defaultdict

def process_manifest(
    manifest_path,
    *,
    base_url="http://localhost:8081",
    root_prefix="/root/",
    default_duration=10.0,
):
    """Build pre-annotated labeling tasks from a JSONL alignment manifest.

    Each non-blank line of the manifest is parsed as a JSON object with the
    keys ``source_audio_filepath``, ``text`` and ``offset``, plus an optional
    ``duration``. Records are grouped by source audio file, ordered by
    ``offset`` within each file, and turned into one task per sentence. Every
    task carries the previous/current/next sentence text and a single
    prediction holding an audio region and a text region.

    NOTE(review): the task layout (``data`` / ``predictions`` / ``result``
    with ``from_name`` / ``to_name``) looks like the Label Studio import
    format; the control names used here presumably have to match the
    project's labeling config -- confirm against that config.

    Args:
        manifest_path: Path to the UTF-8 encoded JSONL manifest.
        base_url: URL prefix under which the audio files are served. A
            trailing slash is tolerated.
        root_prefix: Leading filesystem prefix removed from each audio path
            before it is appended to ``base_url`` (the file server is
            expected to run from that directory).
        default_duration: Value added to ``offset`` to obtain the region end
            when a record has no ``duration`` key.

    Returns:
        A list of task dicts, grouped by audio file in order of first
        appearance in the manifest and sorted by start time within a file.

    Raises:
        OSError: If the manifest cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    # Sentences grouped by source audio; dict ordering keeps the groups in
    # the order the audio files first appear in the manifest.
    sentences_by_audio = defaultdict(list)

    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing newline at end of file)
                # are not records; json.loads("") would raise on them.
                continue

            item = json.loads(line)
            source_audio = item["source_audio_filepath"]
            text = item["text"]
            start_time = item["offset"]
            sentences_by_audio[source_audio].append(
                {
                    "text": text,
                    "start_time": start_time,
                    "end_time": start_time + item.get("duration", default_duration),
                }
            )

    url_base = base_url.rstrip("/")
    all_tasks = []

    for source_audio, unsorted_sentences in sentences_by_audio.items():
        # Strip the root prefix since the file server exposes paths relative
        # to that directory.
        relative_path = source_audio
        if relative_path.startswith(root_prefix):
            relative_path = relative_path[len(root_prefix):]
        audio_url = f"{url_base}/{relative_path}"

        # Neighbouring-sentence context below relies on chronological order.
        sentences = sorted(unsorted_sentences, key=lambda s: s["start_time"])
        last_index = len(sentences) - 1

        for i, sentence in enumerate(sentences):
            prev_chunk = sentences[i - 1]["text"] if i > 0 else ""
            curr_chunk = sentence["text"]
            next_chunk = sentences[i + 1]["text"] if i < last_index else ""

            # Region ids are random 8-hex-digit suffixes, so they differ
            # between runs.
            audio_region = {
                "id": f"audio_{uuid.uuid4().hex[:8]}",
                "from_name": "labels",
                "to_name": "audio",
                "type": "labels",
                "value": {
                    "start": sentence["start_time"],
                    "end": sentence["end_time"],
                    "labels": ["Sentence"],
                },
            }

            # The text region spans the whole current sentence.
            text_region = {
                "id": f"text_{uuid.uuid4().hex[:8]}",
                "from_name": "text_labels",
                "to_name": "current_chunk",
                "type": "labels",
                "value": {
                    "start": 0,
                    "end": len(curr_chunk),
                    "text": curr_chunk,
                    "labels": ["Sentence"],
                },
            }

            all_tasks.append(
                {
                    "data": {
                        "audio": audio_url,
                        "prev_chunk": prev_chunk,
                        "current_chunk": curr_chunk,
                        "next_chunk": next_chunk,
                    },
                    "predictions": [
                        {
                            "model_version": "auto_aligner_v1",
                            "result": [audio_region, text_region],
                        }
                    ],
                }
            )

    return all_tasks

if __name__ == "__main__":
    # Fixed input and output locations for this conversion run; the output
    # path is relative, so the file lands in the current working directory.
    manifest_file = "/root/aivanta_chunks/manifest_final.jsonl"
    output_file = "pre_annotations_sentences.json"

    # Build every task first so a manifest error leaves no output file behind.
    ls_tasks = process_manifest(manifest_file)

    with open(output_file, "w") as out_handle:
        json.dump(ls_tasks, out_handle, indent=2)

    task_count = len(ls_tasks)
    print(f"Successfully generated {output_file} with {task_count} tasks")