import re  # Regular expression module / 正規表現モジュール
import sys  # Command-line argument module / コマンドライン引数の取得に使用

def extract_text_without_speaker(file_path, output_path="output.txt"):
    """Extract the spoken text from a WebVTT file and write it to a text file.

    Blank lines, timestamp lines (anything containing ``-->``) and lines
    starting with ``WEBVTT`` (case-insensitive) are dropped. A leading
    ``Fab Classes:`` / ``Fab クラス:`` speaker label (ASCII or full-width
    colon) is removed from each remaining line. Every other line is kept
    as-is, stripped of surrounding whitespace.

    Args:
        file_path: Path of the UTF-8 encoded VTT file to read. A leading
            byte-order mark is tolerated.
        output_path: Path of the text file to write (UTF-8). Kept lines are
            joined with ``"\\n"``; no trailing newline is added.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # Compiled once: the pattern does not change between lines.
    speaker_prefix = re.compile(r'^(Fab\s?(クラス|Classes)[：:]\s?)')

    text_lines = []
    # "utf-8-sig" drops an optional BOM at the start of the file. With plain
    # "utf-8" the BOM stays glued to the first line, str.strip() does not
    # remove it, and the "WEBVTT" header would slip into the output.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue  # Skip empty lines
            if "-->" in line or line.lower().startswith("webvtt"):
                continue  # Skip timestamp or header lines

            # Remove speaker names like "Fab Classes:" or "Fab クラス:"
            text_lines.append(speaker_prefix.sub('', line))

    # The input is fully read and closed before the output is opened, so
    # passing the same path for both arguments still works.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(text_lines))

# --- Command-line entry point ---
if __name__ == "__main__":
    # Positional arguments only; the script name itself is dropped.
    cli_args = sys.argv[1:]

    # The input file is mandatory: without it, print the usage text
    # (English and Japanese) and exit with status 1.
    if not cli_args:
        print("Usage: python extract_vtt_dialogue_args.py input_file.vtt [output_file.txt]")
        print("使い方: python extract_vtt_dialogue_args.py 入力ファイル.vtt [出力ファイル.txt]")
        sys.exit(1)

    # First argument: the VTT file to read.
    input_file = cli_args[0]

    # Second argument is optional; fall back to the default output name.
    if len(cli_args) > 1:
        output_file = cli_args[1]
    else:
        output_file = "output.txt"

    extract_text_without_speaker(input_file, output_file)
    # Completion message
    print(f"✅ Done! Text saved as '{output_file}'")