{"claim": "Node Fir-10 allocated 1 GPU to Project Sonata on 2026-04-15.", "evidence": [], "id": "train_000001", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Leo Hale was assigned as the data steward on 2026-04-05 had a run with Lumen-7B on Node Spruce-03 that failed because of a checkpoint-mismatch error on 2026-04-11.", "evidence": [{"doc_id": "doc_000012", "sent_id": 4}, {"doc_id": "doc_000146", "sent_id": 6}], "id": "train_000002", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid moved the Milestone J1 deadline to 2026-07-26 on 2026-06-27.", "evidence": [], "id": "train_000003", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "River-8B achieved 0.689 accuracy on SignalSet-2 for Project Anchor on 2026-05-27.", "evidence": [], "id": "train_000004", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Helix-3B achieved a higher latency efficiency score than Vela-3B.", "evidence": [{"doc_id": "doc_000484", "sent_id": 2}, {"doc_id": "doc_000466", "sent_id": 7}], "id": "train_000005", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron had a failed run with Aster-3B on Node Maple-01 because of a missing-index error on 2026-04-02.", "evidence": [{"doc_id": "doc_000242", "sent_id": 3}], "id": "train_000006", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Lena Costa was assigned as the data steward for Project Saffron on 2026-05-11.", "evidence": [{"doc_id": "doc_000078", "sent_id": 2}], "id": "train_000007", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Mira-7B for calibration on 2026-06-15.", "evidence": [], "id": "train_000008", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Orchid-8B achieved 0.845 accuracy on LumenFacts for Project Anchor on 2026-05-27.", "evidence": [], "id": "train_000009", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian's run with Cedar-7B failed on Node Laurel-11 because of an unstable-validation-loss error on 2026-05-29.", "evidence": [{"doc_id": "doc_000321", "sent_id": 8}], "id": "train_000010", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Helix-3B for claim classification on 2026-05-26.", "evidence": [], "id": "train_000011", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-05-04 moved the Milestone H1 deadline to 2026-05-30 on 2026-05-14.", "evidence": [{"doc_id": "doc_000023", "sent_id": 2}, {"doc_id": "doc_000090", "sent_id": 3}], "id": "train_000012", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Fir-10 allocated 2 GPUs to Project Saffron on 2026-06-30.", "evidence": [], "id": "train_000013", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-14 had a failed run with Aster-8B on Node Willow-05 because of an unstable-validation-loss error on 2026-06-20.", "evidence": [{"doc_id": "doc_000216", "sent_id": 4}, {"doc_id": "doc_000436", "sent_id": 6}], "id": "train_000014", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-05-26 changed its method from hybrid retrieval to QLoRA adaptation on 2026-06-28.", "evidence": [{"doc_id": "doc_000090", "sent_id": 7}, {"doc_id": "doc_000415", "sent_id": 4}], "id": "train_000015", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian's run with Nimbus-8B failed because of a checkpoint-mismatch error on 2026-06-19 while using Node Rowan-09.", "evidence": [{"doc_id": "doc_000334", "sent_id": 8}], "id": "train_000016", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-05-05 received 6 GPUs from Node Poplar-12 on 2026-06-27.", "evidence": [{"doc_id": "doc_000004", "sent_id": 9}, {"doc_id": "doc_000125", "sent_id": 6}], "id": "train_000017", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from dense retrieval to rank fusion on 2026-04-06.", "evidence": [], "id": "train_000018", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "River-3B achieved 0.828 accuracy on SignalSet-2 for Project Anchor on 2026-04-27.", "evidence": [{"doc_id": "doc_000230", "sent_id": 2}], "id": "train_000019", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Samir Kwan was assigned as the retrieval owner for Project Sonata on 2026-05-11.", "evidence": [{"doc_id": "doc_000071", "sent_id": 2}], "id": "train_000020", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Node Pine-07 allocated 1 GPU to Project Sonata on 2026-06-21.", "evidence": [], "id": "train_000021", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Felix Lind was assigned as the lead for Project Aster on 2026-04-15.", "evidence": [], "id": "train_000022", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Elm-08 allocated 4 GPUs to Project Meridian on 2026-06-27.", "evidence": [{"doc_id": "doc_000488", "sent_id": 5}], "id": "train_000023", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster's run with Lumen-8B failed on Node Spruce-03 because of a missing-index error on 2026-04-02.", "evidence": [{"doc_id": "doc_000097", "sent_id": 3}], "id": "train_000024", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared macro F1 runs, Aster-8B achieved a higher macro F1 than Kestrel-8B.", "evidence": [{"doc_id": "doc_000433", "sent_id": 2}, {"doc_id": "doc_000135", "sent_id": 7}], "id": "train_000025", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian changed its method from sentence pruning to query rewriting on 2026-04-29.", "evidence": [{"doc_id": "doc_000387", "sent_id": 5}], "id": "train_000026", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Cedar-7B achieved 0.786 evidence F1 on NereidNotes-3 for Project Nereid on 2026-05-30.", "evidence": [{"doc_id": "doc_000290", "sent_id": 6}], "id": "train_000027", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-06-01 had a run with Cedar-3B that failed because of a missing-index error on 2026-04-18 while using Node Laurel-11.", "evidence": [{"doc_id": "doc_000338", "sent_id": 2}, {"doc_id": "doc_000206", "sent_id": 5}], "id": "train_000028", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from rank fusion to teacher distillation on 2026-04-13.", "evidence": [], "id": "train_000029", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor moved the Milestone T1 deadline from 2026-04-22 to 2026-04-30 on 2026-04-01.", "evidence": [], "id": "train_000030", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid changed its method from hybrid retrieval to QLoRA adaptation on 2026-05-03.", "evidence": [{"doc_id": "doc_000015", "sent_id": 4}], "id": "train_000031", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Nested Verifier Study 4 reported on 2026-05-28 that it used cross-encoder reranking and did not use a reward model.", "evidence": [], "id": "train_000032", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Lumen-3B achieved 0.934 latency efficiency score on MemoTrace-3 for Project Anchor on 2026-05-12.", "evidence": [{"doc_id": "doc_000303", "sent_id": 7}], "id": "train_000033", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected River-3B for evidence retrieval on 2026-04-06.", "evidence": [], "id": "train_000034", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-14 recorded macro F1 for Mira-8B on TraceEval-3 using Node Sycamore-13 on 2026-04-04.", "evidence": [{"doc_id": "doc_000216", "sent_id": 4}, {"doc_id": "doc_000182", "sent_id": 6}], "id": "train_000035", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Julian Gray was assigned as the lead on 2026-04-14 selected Nova-3B for reranking on 2026-06-28.", "evidence": [{"doc_id": "doc_000265", "sent_id": 8}, {"doc_id": "doc_000008", "sent_id": 4}], "id": "train_000036", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Rohan Singh was assigned as the retrieval owner for Project Sonata on 2026-06-29.", "evidence": [{"doc_id": "doc_000246", "sent_id": 2}], "id": "train_000037", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Petra Gray was assigned as the evaluation owner for Project Saffron on 2026-05-05.", "evidence": [{"doc_id": "doc_000285", "sent_id": 7}], "id": "train_000038", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Theo Lind was assigned as the evaluation owner for Project Saffron on 2026-04-14.", "evidence": [{"doc_id": "doc_000255", "sent_id": 7}], "id": "train_000039", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Helix-8B achieved 0.913 macro F1 on OrionBench-2 for Project Aster on 2026-05-11.", "evidence": [{"doc_id": "doc_000337", "sent_id": 2}], "id": "train_000040", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron changed its method from chain verification to structured prompting on 2026-06-28.", "evidence": [{"doc_id": "doc_000355", "sent_id": 5}], "id": "train_000041", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Sofia Nadir was assigned as the retrieval owner for Project Sonata on 2026-06-16.", "evidence": [{"doc_id": "doc_000352", "sent_id": 7}], "id": "train_000042", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid moved the Milestone V1 deadline to 2026-04-08 on 2026-04-01.", "evidence": [], "id": "train_000043", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira-3B achieved 0.613 latency efficiency score on MemoTrace-2 for Project Saffron on 2026-05-24.", "evidence": [], "id": "train_000044", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Quartz-8B for claim classification on 2026-06-08.", "evidence": [], "id": "train_000045", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Mina Shah was assigned as the lead on 2026-06-09 had a run with Lumen-8B on Node Spruce-03 that failed because of a missing-index error on 2026-04-17.", "evidence": [{"doc_id": "doc_000351", "sent_id": 6}, {"doc_id": "doc_000287", "sent_id": 8}], "id": "train_000046", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Vela-3B achieved 0.771 macro F1 on LabQA-2 for Project Sonata on 2026-04-07.", "evidence": [{"doc_id": "doc_000393", "sent_id": 7}], "id": "train_000047", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor's run with Lumen-3B failed on Node Spruce-03 because of a checkpoint-mismatch error on 2026-05-15.", "evidence": [{"doc_id": "doc_000137", "sent_id": 8}], "id": "train_000048", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Helix-7B achieved 0.512 evidence F1 on OrionBench-2 for Project Aster on 2026-05-13.", "evidence": [], "id": "train_000049", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster moved the Milestone N1 deadline to 2026-06-09 on 2026-06-03.", "evidence": [{"doc_id": "doc_000408", "sent_id": 7}], "id": "train_000050", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Aster-3B achieved 0.675 evidence F1 on RiverBench-3 for Project Sonata on 2026-05-04.", "evidence": [{"doc_id": "doc_000020", "sent_id": 2}], "id": "train_000051", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Spruce-03 allocated 1 GPU to Project Aster on 2026-05-21.", "evidence": [], "id": "train_000052", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Rohan Singh was assigned as the retrieval owner on 2026-04-28 had a failed run with Aster-8B on Node Willow-05 because of an out-of-memory error on 2026-06-11.", "evidence": [{"doc_id": "doc_000159", "sent_id": 6}, {"doc_id": "doc_000181", "sent_id": 2}], "id": "train_000053", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-05-04 recorded evidence F1 for Atlas-8B on RiverBench-2 using Node Poplar-12 on 2026-04-13.", "evidence": [{"doc_id": "doc_000495", "sent_id": 1}, {"doc_id": "doc_000269", "sent_id": 2}], "id": "train_000054", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Selene Rios was assigned as the data steward on 2026-04-07 changed its method from temporal filtering to alias expansion on 2026-05-09.", "evidence": [{"doc_id": "doc_000365", "sent_id": 7}, {"doc_id": "doc_000470", "sent_id": 6}], "id": "train_000055", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Atlas-8B for evidence retrieval on 2026-04-16.", "evidence": [{"doc_id": "doc_000268", "sent_id": 3}], "id": "train_000056", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira-7B achieved 0.714 accuracy on MemoTrace-2 for Project Saffron on 2026-05-04.", "evidence": [{"doc_id": "doc_000134", "sent_id": 2}], "id": "train_000057", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Evan Iyer was assigned as the retrieval owner on 2026-04-28 moved the Milestone J1 deadline to 2026-05-06 on 2026-04-02.", "evidence": [{"doc_id": "doc_000039", "sent_id": 7}, {"doc_id": "doc_000209", "sent_id": 3}], "id": "train_000058", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Elm-08 allocated 4 GPUs to Project Meridian on 2026-05-24.", "evidence": [], "id": "train_000059", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from sentence pruning to query rewriting on 2026-04-12.", "evidence": [{"doc_id": "doc_000190", "sent_id": 4}], "id": "train_000060", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Mara Lane was assigned as the retrieval owner on 2026-06-08 changed its method from late interaction to BM25 retrieval on 2026-06-28.", "evidence": [{"doc_id": "doc_000204", "sent_id": 2}, {"doc_id": "doc_000308", "sent_id": 5}], "id": "train_000061", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Elian Shah was assigned as the data steward for Project Meridian on 2026-04-25.", "evidence": [], "id": "train_000062", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared macro F1 runs, Nimbus-8B achieved a higher macro F1 than Kestrel-8B.", "evidence": [{"doc_id": "doc_000391", "sent_id": 2}, {"doc_id": "doc_000135", "sent_id": 7}], "id": "train_000063", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster's run with Finch-3B failed because of an out-of-memory error on 2026-06-18 while using Node Sycamore-13.", "evidence": [{"doc_id": "doc_000125", "sent_id": 3}], "id": "train_000064", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Priya Vale was assigned as the evaluation owner on 2026-06-23 selected Lumen-8B for error analysis on 2026-04-05.", "evidence": [{"doc_id": "doc_000259", "sent_id": 7}, {"doc_id": "doc_000484", "sent_id": 4}], "id": "train_000065", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Dr. Ravi Reed was assigned as the evaluation owner for Project Meridian on 2026-04-29.", "evidence": [], "id": "train_000066", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster selected Helix-7B for claim classification on 2026-05-04.", "evidence": [], "id": "train_000067", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone B2 deadline from 2026-06-27 to 2026-07-05 on 2026-06-03.", "evidence": [{"doc_id": "doc_000267", "sent_id": 5}], "id": "train_000068", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor moved the Milestone N1 deadline from 2026-05-20 to 2026-05-24 on 2026-05-09.", "evidence": [], "id": "train_000069", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-06-23 selected Nimbus-7B for calibration on 2026-04-21.", "evidence": [{"doc_id": "doc_000204", "sent_id": 7}, {"doc_id": "doc_000464", "sent_id": 7}], "id": "train_000070", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Delta Evidence Study 1 reported on 2026-06-01 that it used document chunking and used a reward model.", "evidence": [{"doc_id": "doc_000301", "sent_id": 1}], "id": "train_000071", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Helix-7B achieved 0.628 latency efficiency score on CedarQA-3 for Project Anchor on 2026-04-07.", "evidence": [{"doc_id": "doc_000422", "sent_id": 7}], "id": "train_000072", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Jonas Nolan was assigned as the retrieval owner for Project Sonata on 2026-05-18.", "evidence": [{"doc_id": "doc_000323", "sent_id": 2}], "id": "train_000073", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid's run with Kestrel-3B failed because of an out-of-memory error on 2026-06-25 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000393", "sent_id": 3}], "id": "train_000074", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Atlas-8B for evidence retrieval on 2026-06-26.", "evidence": [{"doc_id": "doc_000405", "sent_id": 8}], "id": "train_000075", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from chain verification to confidence calibration on 2026-04-01.", "evidence": [{"doc_id": "doc_000422", "sent_id": 5}], "id": "train_000076", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Selene Rios was assigned as the data steward on 2026-06-08 recorded latency efficiency score for Finch-8B on VestaLogs using Node Juniper-06 on 2026-04-21.", "evidence": [{"doc_id": "doc_000225", "sent_id": 2}, {"doc_id": "doc_000302", "sent_id": 7}], "id": "train_000077", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Marble-7B for error analysis on 2026-05-08.", "evidence": [{"doc_id": "doc_000238", "sent_id": 9}], "id": "train_000078", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Mira Sato was assigned as the data steward on 2026-06-30 moved the Milestone J1 deadline from 2026-05-24 to 2026-06-03 on 2026-05-14.", "evidence": [{"doc_id": "doc_000465", "sent_id": 7}, {"doc_id": "doc_000104", "sent_id": 3}], "id": "train_000079", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Priya Vale was assigned as the evaluation owner for Project Saffron on 2026-06-29.", "evidence": [{"doc_id": "doc_000070", "sent_id": 2}], "id": "train_000080", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Finch-7B for error analysis on 2026-05-22.", "evidence": [{"doc_id": "doc_000117", "sent_id": 8}], "id": "train_000081", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-04-21 received 6 GPUs from Node Fir-10 on 2026-06-26.", "evidence": [{"doc_id": "doc_000271", "sent_id": 7}, {"doc_id": "doc_000345", "sent_id": 8}], "id": "train_000082", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Cedar-8B achieved 0.872 latency efficiency score on VestaLogs-2 for Project Meridian on 2026-06-09.", "evidence": [{"doc_id": "doc_000140", "sent_id": 7}], "id": "train_000083", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Marble-3B achieved 0.672 latency efficiency score on VestaLogs-3 for Project Saffron on 2026-04-06.", "evidence": [{"doc_id": "doc_000455", "sent_id": 2}], "id": "train_000084", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-05-26 received 3 GPUs from Node Sycamore-13 on 2026-04-02.", "evidence": [{"doc_id": "doc_000138", "sent_id": 7}, {"doc_id": "doc_000012", "sent_id": 3}], "id": "train_000085", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Finch-3B achieved 0.592 latency efficiency score on VestaLogs for Project Anchor on 2026-05-05.", "evidence": [{"doc_id": "doc_000387", "sent_id": 7}], "id": "train_000086", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-06-22 moved the Milestone B2 deadline from 2026-06-15 to 2026-06-06 on 2026-05-28.", "evidence": [{"doc_id": "doc_000365", "sent_id": 2}, {"doc_id": "doc_000160", "sent_id": 3}], "id": "train_000087", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Anika Costa was assigned as the retrieval owner on 2026-05-19 selected Orchid-7B for reranking on 2026-05-08.", "evidence": [{"doc_id": "doc_000018", "sent_id": 7}, {"doc_id": "doc_000133", "sent_id": 8}], "id": "train_000088", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Atlas-7B for claim classification on 2026-06-21.", "evidence": [{"doc_id": "doc_000108", "sent_id": 4}], "id": "train_000089", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid moved the Milestone D1 deadline to 2026-06-21 on 2026-06-14.", "evidence": [], "id": "train_000090", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Lumen-7B for calibration on 2026-06-19.", "evidence": [{"doc_id": "doc_000352", "sent_id": 8}], "id": "train_000091", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Mara Lane was assigned as the retrieval owner on 2026-04-20 selected Cedar-7B for reranking on 2026-04-09.", "evidence": [{"doc_id": "doc_000346", "sent_id": 2}, {"doc_id": "doc_000190", "sent_id": 3}], "id": "train_000092", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Atlas-7B for evidence retrieval on 2026-05-15.", "evidence": [{"doc_id": "doc_000015", "sent_id": 8}], "id": "train_000093", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron moved the Milestone X1 deadline from 2026-06-22 to 2026-06-30 on 2026-06-11.", "evidence": [], "id": "train_000094", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Sycamore-13 allocated 2 GPUs to Project Saffron on 2026-04-05.", "evidence": [], "id": "train_000095", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared macro F1 runs, Mira-3B achieved a higher macro F1 than Quartz-8B.", "evidence": [{"doc_id": "doc_000353", "sent_id": 2}, {"doc_id": "doc_000240", "sent_id": 7}], "id": "train_000096", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster changed its method from reward reranking to threshold search on 2026-06-03.", "evidence": [{"doc_id": "doc_000486", "sent_id": 5}], "id": "train_000097", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Poplar-12 allocated 3 GPUs to Project Aster on 2026-06-27.", "evidence": [], "id": "train_000098", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Nimbus-7B achieved 0.779 macro F1 on TraceEval-2 for Project Nereid on 2026-05-04.", "evidence": [{"doc_id": "doc_000489", "sent_id": 2}], "id": "train_000099", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Felix Brooks was assigned as the retrieval owner for Project Aster on 2026-04-13.", "evidence": [{"doc_id": "doc_000402", "sent_id": 2}], "id": "train_000100", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Nimbus-7B for calibration on 2026-04-20.", "evidence": [], "id": "train_000101", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Nimbus-7B achieved 0.558 macro F1 on TraceEval-2 for Project Nereid on 2026-06-17.", "evidence": [], "id": "train_000102", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Owen Torres was assigned as the evaluation owner for Project Saffron on 2026-06-08.", "evidence": [{"doc_id": "doc_000376", "sent_id": 2}], "id": "train_000103", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster's run with Lumen-7B failed because of an out-of-memory error on 2026-04-09 while using Node Cedar-02.", "evidence": [{"doc_id": "doc_000455", "sent_id": 3}], "id": "train_000104", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-04-14 changed its method from hybrid retrieval to QLoRA adaptation on 2026-06-20.", "evidence": [{"doc_id": "doc_000209", "sent_id": 7}, {"doc_id": "doc_000494", "sent_id": 6}], "id": "train_000105", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Lena Costa was assigned as the data steward on 2026-06-23 moved the Milestone F1 deadline to 2026-07-11 on 2026-06-23.", "evidence": [{"doc_id": "doc_000042", "sent_id": 7}, {"doc_id": "doc_000216", "sent_id": 7}], "id": "train_000106", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared macro F1 runs, Marble-8B achieved a higher macro F1 than Vela-8B.", "evidence": [{"doc_id": "doc_000033", "sent_id": 2}, {"doc_id": "doc_000312", "sent_id": 7}], "id": "train_000107", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Dr. Ravi Nadir was assigned as the data steward for Project Meridian on 2026-06-08.", "evidence": [{"doc_id": "doc_000427", "sent_id": 2}], "id": "train_000108", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Nova-3B achieved 0.750 evidence F1 on LabQA for Project Nereid on 2026-06-27.", "evidence": [{"doc_id": "doc_000118", "sent_id": 6}], "id": "train_000109", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-05-25 had a failed run with Marble-8B on Node Maple-01 because of a checkpoint-mismatch error on 2026-05-14.", "evidence": [{"doc_id": "doc_000113", "sent_id": 1}, {"doc_id": "doc_000297", "sent_id": 3}], "id": "train_000110", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster's run with Lumen-3B failed on Node Spruce-03 because of an unstable-validation-loss error on 2026-04-10.", "evidence": [{"doc_id": "doc_000253", "sent_id": 8}], "id": "train_000111", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from metric smoothing to hard-negative mining on 2026-05-15.", "evidence": [], "id": "train_000112", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Atlas-3B for evidence retrieval on 2026-06-19.", "evidence": [{"doc_id": "doc_000169", "sent_id": 8}], "id": "train_000113", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster's run with River-8B failed on Node Rowan-09 because of an out-of-memory error on 2026-06-05.", "evidence": [{"doc_id": "doc_000067", "sent_id": 8}], "id": "train_000114", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian selected Kestrel-3B for claim classification on 2026-05-04.", "evidence": [], "id": "train_000115", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster's run with Helix-8B failed on Node Poplar-12 because of a checkpoint-mismatch error on 2026-05-14.", "evidence": [{"doc_id": "doc_000213", "sent_id": 3}], "id": "train_000116", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-30 received 5 GPUs from Node Hazel-14 on 2026-05-18.", "evidence": [{"doc_id": "doc_000288", "sent_id": 10}, {"doc_id": "doc_000333", "sent_id": 2}], "id": "train_000117", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron moved the Milestone X1 deadline from 2026-06-22 to 2026-06-30 on 2026-06-10.", "evidence": [{"doc_id": "doc_000037", "sent_id": 5}], "id": "train_000118", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-05-04 had a run with Kestrel-3B that failed because of an unstable-validation-loss error on 2026-06-04 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000495", "sent_id": 1}, {"doc_id": "doc_000356", "sent_id": 3}], "id": "train_000119", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, River-7B achieved a higher accuracy than Nova-7B.", "evidence": [{"doc_id": "doc_000241", "sent_id": 6}, {"doc_id": "doc_000316", "sent_id": 7}], "id": "train_000120", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Sofia Nadir was assigned as the retrieval owner on 2026-04-21 moved the Milestone X1 deadline to 2026-05-08 on 2026-04-21.", "evidence": [{"doc_id": "doc_000145", "sent_id": 7}, {"doc_id": "doc_000186", "sent_id": 6}], "id": "train_000121", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid's run with Cedar-3B failed on Node Laurel-11 because of a missing-index error on 2026-04-10.", "evidence": [{"doc_id": "doc_000318", "sent_id": 8}], "id": "train_000122", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Ravi Reed was assigned as the evaluation owner on 2026-06-09 selected Aster-7B for calibration on 2026-06-28.", "evidence": [{"doc_id": "doc_000384", "sent_id": 9}, {"doc_id": "doc_000139", "sent_id": 4}], "id": "train_000123", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Mira-8B achieved 0.748 evidence F1 on TraceEval-3 for Project Sonata on 2026-04-27.", "evidence": [], "id": "train_000124", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian changed its method from LoRA adaptation to alias expansion on 2026-04-10.", "evidence": [], "id": "train_000125", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron's run with Marble-8B failed because of a checkpoint-mismatch error on 2026-06-25 while using Node Aspen-01.", "evidence": [{"doc_id": "doc_000318", "sent_id": 3}], "id": "train_000126", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Arun Bauer was assigned as the evaluation owner on 2026-06-29 changed its method from hard-negative mining to metric smoothing on 2026-04-09.", "evidence": [{"doc_id": "doc_000209", "sent_id": 2}, {"doc_id": "doc_000464", "sent_id": 3}], "id": "train_000127", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-06-01 recorded latency efficiency score for Nova-7B on LumenFacts-2 using Node Elm-08 on 2026-04-25.", "evidence": [{"doc_id": "doc_000338", "sent_id": 2}, {"doc_id": "doc_000268", "sent_id": 6}], "id": "train_000128", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mara Lane was assigned as the retrieval owner on 2026-04-20 selected Aster-8B for evidence retrieval on 2026-06-04.", "evidence": [{"doc_id": "doc_000346", "sent_id": 2}, {"doc_id": "doc_000169", "sent_id": 3}], "id": "train_000129", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian had a failed run with Finch-8B on Node Laurel-11 because of a checkpoint-mismatch error on 2026-06-12.", "evidence": [{"doc_id": "doc_000413", "sent_id": 7}], "id": "train_000130", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster moved the Milestone Z1 deadline to 2026-05-14 on 2026-04-22.", "evidence": [], "id": "train_000131", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-05-04 had a run with Lumen-3B on Node Spruce-03 that failed because of an out-of-memory error on 2026-04-16.", "evidence": [{"doc_id": "doc_000023", "sent_id": 2}, {"doc_id": "doc_000024", "sent_id": 3}], "id": "train_000132", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Mira-8B for calibration on 2026-05-15.", "evidence": [{"doc_id": "doc_000165", "sent_id": 8}], "id": "train_000133", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster changed its method from hard-negative mining to metric smoothing on 2026-05-06.", "evidence": [{"doc_id": "doc_000230", "sent_id": 5}], "id": "train_000134", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Cedar-7B for error analysis on 2026-05-15.", "evidence": [{"doc_id": "doc_000395", "sent_id": 8}], "id": "train_000135", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared macro F1 runs, Marble-3B achieved a higher macro F1 than Nova-7B.", "evidence": [{"doc_id": "doc_000461", "sent_id": 6}, {"doc_id": "doc_000259", "sent_id": 6}], "id": "train_000136", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron's run with Mira-8B failed on Node Sycamore-13 because of a missing-index error on 2026-04-24.", "evidence": [{"doc_id": "doc_000455", "sent_id": 8}], "id": "train_000137", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Frost was assigned as the data steward on 2026-04-14 had a run with Helix-8B on Node Poplar-12 that failed because of a missing-index error on 2026-05-16.", "evidence": [{"doc_id": "doc_000233", "sent_id": 7}, {"doc_id": "doc_000360", "sent_id": 5}], "id": "train_000138", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster moved the Milestone N1 deadline to 2026-06-08 on 2026-05-13.", "evidence": [{"doc_id": "doc_000117", "sent_id": 5}], "id": "train_000139", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-05-25 recorded latency efficiency score for Quartz-7B on CedarQA-2 using Node Aspen-01 on 2026-06-08.", "evidence": [{"doc_id": "doc_000113", "sent_id": 1}, {"doc_id": "doc_000135", "sent_id": 2}], "id": "train_000140", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Lena Costa was assigned as the data steward on 2026-06-23 had a run with Marble-3B that failed because of an unstable-validation-loss error on 2026-04-09 while using Node Maple-01.", "evidence": [{"doc_id": "doc_000042", "sent_id": 7}, {"doc_id": "doc_000026", "sent_id": 3}], "id": "train_000141", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Nadia Singh was assigned as the data steward for Project Meridian on 2026-04-29.", "evidence": [], "id": "train_000142", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Quartz-3B achieved 0.562 macro F1 on OrionBench for Project Sonata on 2026-05-02.", "evidence": [{"doc_id": "doc_000298", "sent_id": 6}], "id": "train_000143", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid's run with Cedar-3B failed because of an unstable-validation-loss error on 2026-04-16 while using Node Laurel-11.", "evidence": [{"doc_id": "doc_000254", "sent_id": 3}], "id": "train_000144", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor's run with Lumen-7B failed on Node Spruce-03 because of a checkpoint-mismatch error on 2026-04-12.", "evidence": [], "id": "train_000145", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Anika Sato was assigned as the lead for Project Aster on 2026-05-13.", "evidence": [], "id": "train_000146", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from contrastive tuning to threshold search on 2026-05-22.", "evidence": [], "id": "train_000147", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata had a failed run with Quartz-7B on Node Pine-07 because of an out-of-memory error on 2026-04-10.", "evidence": [{"doc_id": "doc_000281", "sent_id": 7}], "id": "train_000148", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron changed its method from rank fusion to teacher distillation on 2026-04-19.", "evidence": [{"doc_id": "doc_000268", "sent_id": 4}], "id": "train_000149", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Rohan Park was assigned as the lead on 2026-04-14 recorded macro F1 for Vela-7B on LabQA-2 using Node Fir-10 on 2026-06-09.", "evidence": [{"doc_id": "doc_000327", "sent_id": 7}, {"doc_id": "doc_000079", "sent_id": 7}], "id": "train_000150", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid changed its method from evidence pooling to BM25 retrieval on 2026-06-10.", "evidence": [{"doc_id": "doc_000334", "sent_id": 5}], "id": "train_000151", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Pine-07 allocated 3 GPUs to Project Sonata on 2026-04-18.", "evidence": [{"doc_id": "doc_000026", "sent_id": 6}], "id": "train_000152", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Spruce-03 allocated 5 GPUs to Project Aster on 2026-04-18.", "evidence": [{"doc_id": "doc_000278", "sent_id": 6}], "id": "train_000153", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone X1 deadline to 2026-05-09 on 2026-04-12.", "evidence": [], "id": "train_000154", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster changed its method from sentence pruning to query rewriting on 2026-05-20.", "evidence": [{"doc_id": "doc_000344", "sent_id": 4}], "id": "train_000155", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor changed its method from confidence calibration to calibrated voting on 2026-05-13.", "evidence": [{"doc_id": "doc_000127", "sent_id": 5}], "id": "train_000156", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian's run with Nova-8B failed because of a missing-index error on 2026-05-28 while using Node Aspen-01.", "evidence": [{"doc_id": "doc_000349", "sent_id": 3}], "id": "train_000157", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Leo Park was assigned as the evaluation owner for Project Anchor on 2026-04-21.", "evidence": [{"doc_id": "doc_000061", "sent_id": 7}], "id": "train_000158", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from threshold search to reward reranking on 2026-06-12.", "evidence": [], "id": "train_000159", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster selected Lumen-8B for calibration on 2026-05-16.", "evidence": [], "id": "train_000160", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Owen Marin was assigned as the data steward for Project Saffron on 2026-06-01.", "evidence": [{"doc_id": "doc_000481", "sent_id": 2}], "id": "train_000161", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Darian Hale was assigned as the retrieval owner for Project Nereid on 2026-06-28.", "evidence": [], "id": "train_000162", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Finch-8B achieved 0.937 macro F1 on NereidNotes-2 for Project Aster on 2026-06-22.", "evidence": [{"doc_id": "doc_000253", "sent_id": 2}], "id": "train_000163", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata moved the Milestone X1 deadline to 2026-05-18 on 2026-04-22.", "evidence": [{"doc_id": "doc_000480", "sent_id": 5}], "id": "train_000164", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Selene Rios was assigned as the data steward on 2026-04-07 moved the Milestone H1 deadline to 2026-06-26 on 2026-06-12.", "evidence": [{"doc_id": "doc_000365", "sent_id": 7}, {"doc_id": "doc_000160", "sent_id": 8}], "id": "train_000165", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Stable Chains Study 2 reported on 2026-04-16 that it used chain verification and did not use a reward model.", "evidence": [], "id": "train_000166", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Sonata selected Mira-3B for calibration on 2026-04-09.", "evidence": [{"doc_id": "doc_000437", "sent_id": 3}], "id": "train_000167", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Birch-04 allocated 4 GPUs to Project Meridian on 2026-05-09.", "evidence": [{"doc_id": "doc_000230", "sent_id": 6}], "id": "train_000168", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Aster-7B achieved 0.649 latency efficiency score on SignalSet for Project Saffron on 2026-04-25.", "evidence": [{"doc_id": "doc_000064", "sent_id": 6}], "id": "train_000169", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid selected Quartz-8B for reranking on 2026-05-17.", "evidence": [{"doc_id": "doc_000492", "sent_id": 4}], "id": "train_000170", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-04-28 selected Vela-3B for reranking on 2026-06-11.", "evidence": [{"doc_id": "doc_000402", "sent_id": 7}, {"doc_id": "doc_000237", "sent_id": 3}], "id": "train_000171", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Aspen-01 allocated 4 GPUs to Project Meridian on 2026-04-04.", "evidence": [{"doc_id": "doc_000295", "sent_id": 6}], "id": "train_000172", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Orchid-8B achieved 0.586 latency efficiency score on LumenFacts for Project Anchor on 2026-06-22.", "evidence": [{"doc_id": "doc_000393", "sent_id": 2}], "id": "train_000173", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Leo Hale was assigned as the data steward for Project Anchor on 2026-05-06.", "evidence": [], "id": "train_000174", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor moved the Milestone N1 deadline from 2026-07-16 to 2026-07-26 on 2026-06-24.", "evidence": [{"doc_id": "doc_000248", "sent_id": 5}], "id": "train_000175", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Cedar-8B achieved 0.756 macro F1 on NereidNotes-3 for Project Nereid on 2026-05-23.", "evidence": [{"doc_id": "doc_000326", "sent_id": 8}], "id": "train_000176", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Mira-8B for calibration on 2026-06-01.", "evidence": [], "id": "train_000177", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Atlas-7B achieved 0.860 latency efficiency score on SignalSet-3 for Project Meridian on 2026-05-19.", "evidence": [{"doc_id": "doc_000020", "sent_id": 7}], "id": "train_000178", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-04-27 selected Vela-7B for reranking on 2026-06-04.", "evidence": [{"doc_id": "doc_000165", "sent_id": 2}, {"doc_id": "doc_000461", "sent_id": 3}], "id": "train_000179", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Quartz-7B achieved 0.577 latency efficiency score on CedarQA-2 for Project Saffron on 2026-06-20.", "evidence": [{"doc_id": "doc_000427", "sent_id": 6}], "id": "train_000180", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Nadia Singh was assigned as the data steward on 2026-06-29 selected Quartz-7B for calibration on 2026-04-09.", "evidence": [{"doc_id": "doc_000292", "sent_id": 2}, {"doc_id": "doc_000019", "sent_id": 3}], "id": "train_000181", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid moved the Milestone J1 deadline to 2026-06-07 on 2026-05-27.", "evidence": [{"doc_id": "doc_000177", "sent_id": 5}], "id": "train_000182", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Amber Ranking Study 4 reported on 2026-05-06 that it used late interaction and used a reward model.", "evidence": [{"doc_id": "doc_000075", "sent_id": 5}], "id": "train_000183", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Owen Marin was assigned as the data steward for Project Saffron on 2026-06-09.", "evidence": [{"doc_id": "doc_000375", "sent_id": 7}], "id": "train_000184", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster's run with Finch-7B failed on Node Juniper-06 because of a missing-index error on 2026-06-11.", "evidence": [{"doc_id": "doc_000261", "sent_id": 3}], "id": "train_000185", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Priya Moss was assigned as the data steward for Project Saffron on 2026-05-06.", "evidence": [], "id": "train_000186", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Kestrel-3B for claim classification on 2026-05-15.", "evidence": [{"doc_id": "doc_000372", "sent_id": 8}], "id": "train_000187", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Sycamore-13 allocated 3 GPUs to Project Sonata on 2026-06-06.", "evidence": [{"doc_id": "doc_000474", "sent_id": 6}], "id": "train_000188", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Nimbus-8B for calibration on 2026-06-05.", "evidence": [{"doc_id": "doc_000491", "sent_id": 8}], "id": "train_000189", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Kira Frost was assigned as the data steward for Project Anchor on 2026-06-09.", "evidence": [{"doc_id": "doc_000267", "sent_id": 7}], "id": "train_000190", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Talia Reed was assigned as the retrieval owner on 2026-05-26 changed its method from late interaction to cross-encoder reranking on 2026-05-10.", "evidence": [{"doc_id": "doc_000319", "sent_id": 7}, {"doc_id": "doc_000495", "sent_id": 3}], "id": "train_000191", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Pale Compass Study 3 reported on 2026-05-30 that it used contrastive tuning and did not use a reward model.", "evidence": [], "id": "train_000192", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata selected Atlas-3B for claim classification on 2026-04-19.", "evidence": [{"doc_id": "doc_000183", "sent_id": 3}], "id": "train_000193", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Saffron's run with Aster-8B failed on Node Willow-05 because of a missing-index error on 2026-06-12.", "evidence": [{"doc_id": "doc_000207", "sent_id": 8}], "id": "train_000194", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Priya Vale was assigned as the evaluation owner on 2026-06-23 selected Marble-7B for error analysis on 2026-05-14.", "evidence": [{"doc_id": "doc_000259", "sent_id": 7}, {"doc_id": "doc_000315", "sent_id": 3}], "id": "train_000195", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Kestrel-8B achieved a higher evidence F1 than Orchid-8B.", "evidence": [{"doc_id": "doc_000032", "sent_id": 2}, {"doc_id": "doc_000328", "sent_id": 6}], "id": "train_000196", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Vela-7B achieved 0.580 evidence F1 on LabQA-2 for Project Sonata on 2026-04-18.", "evidence": [{"doc_id": "doc_000114", "sent_id": 6}], "id": "train_000197", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-05-25 had a failed run with Nova-3B on Node Aspen-01 because of a checkpoint-mismatch error on 2026-04-17.", "evidence": [{"doc_id": "doc_000351", "sent_id": 1}, {"doc_id": "doc_000040", "sent_id": 8}], "id": "train_000198", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Birch-04 allocated 1 GPU to Project Nereid on 2026-04-23.", "evidence": [], "id": "train_000199", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from temporal filtering to alias expansion on 2026-05-03.", "evidence": [{"doc_id": "doc_000372", "sent_id": 4}], "id": "train_000200", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Priya Moss was assigned as the data steward for Project Saffron on 2026-06-30.", "evidence": [{"doc_id": "doc_000118", "sent_id": 7}], "id": "train_000201", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Orchid-7B achieved a higher macro F1 than Cedar-7B.", "evidence": [{"doc_id": "doc_000220", "sent_id": 9}, {"doc_id": "doc_000400", "sent_id": 6}], "id": "train_000202", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Mina Torres was assigned as the evaluation owner for Project Saffron on 2026-06-30.", "evidence": [{"doc_id": "doc_000248", "sent_id": 7}], "id": "train_000203", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Camila Brooks was assigned as the data steward for Project Meridian on 2026-06-03.", "evidence": [], "id": "train_000204", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Anika Costa was assigned as the retrieval owner on 2026-05-19 selected Lumen-3B for calibration on 2026-05-07.", "evidence": [{"doc_id": "doc_000018", "sent_id": 7}, {"doc_id": "doc_000162", "sent_id": 3}], "id": "train_000205", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Stable Chains Study 2 reported on 2026-05-18 that it used BM25 retrieval and did not use a reward model.", "evidence": [{"doc_id": "doc_000418", "sent_id": 2}], "id": "train_000206", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone B1 deadline from 2026-05-20 to 2026-05-26 on 2026-04-25.", "evidence": [], "id": "train_000207", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata selected Nimbus-3B for calibration on 2026-06-07.", "evidence": [{"doc_id": "doc_000007", "sent_id": 4}], "id": "train_000208", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Lumen-7B achieved 0.919 evidence F1 on TraceEval for Project Aster on 2026-04-06.", "evidence": [{"doc_id": "doc_000002", "sent_id": 2}], "id": "train_000209", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Aspen-01 allocated 3 GPUs to Project Nereid on 2026-05-23.", "evidence": [{"doc_id": "doc_000337", "sent_id": 6}], "id": "train_000210", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Dr. Arun Bauer was assigned as the evaluation owner for Project Anchor on 2026-05-19.", "evidence": [{"doc_id": "doc_000401", "sent_id": 7}], "id": "train_000211", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-30 received 3 GPUs from Node Aspen-01 on 2026-05-09.", "evidence": [{"doc_id": "doc_000288", "sent_id": 10}, {"doc_id": "doc_000147", "sent_id": 6}], "id": "train_000212", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, River-7B achieved a higher accuracy than Mira-8B.", "evidence": [{"doc_id": "doc_000241", "sent_id": 6}, {"doc_id": "doc_000177", "sent_id": 6}], "id": "train_000213", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Nora Bauer was assigned as the lead on 2026-05-25 recorded macro F1 for Nova-3B on LabQA using Node Aspen-01 on 2026-05-19.", "evidence": [{"doc_id": "doc_000388", "sent_id": 2}, {"doc_id": "doc_000450", "sent_id": 7}], "id": "train_000214", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata changed its method from rank fusion to teacher distillation on 2026-05-28.", "evidence": [], "id": "train_000215", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Spruce-03 allocated 3 GPUs to Project Aster on 2026-05-26.", "evidence": [], "id": "train_000216", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-05-26 changed its method from chain verification to structured prompting on 2026-06-24.", "evidence": [{"doc_id": "doc_000090", "sent_id": 7}, {"doc_id": "doc_000049", "sent_id": 5}], "id": "train_000217", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Amber Ranking Study 1 reported on 2026-04-10 that it used LoRA adaptation and did not use a reward model.", "evidence": [{"doc_id": "doc_000008", "sent_id": 8}], "id": "train_000218", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Jonas Nolan was assigned as the retrieval owner for Project Sonata on 2026-05-27.", "evidence": [], "id": "train_000219", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Kestrel-7B for claim classification on 2026-04-24.", "evidence": [{"doc_id": "doc_000061", "sent_id": 8}], "id": "train_000220", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata's run with Vela-3B failed because of an out-of-memory error on 2026-06-25 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000228", "sent_id": 3}], "id": "train_000221", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Node Laurel-11 allocated 4 GPUs to Project Meridian on 2026-06-06.", "evidence": [{"doc_id": "doc_000486", "sent_id": 6}], "id": "train_000222", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mina Torres was assigned as the data steward for Project Meridian on 2026-04-06.", "evidence": [{"doc_id": "doc_000190", "sent_id": 2}], "id": "train_000223", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Cedar-8B achieved a higher evidence F1 than River-8B.", "evidence": [{"doc_id": "doc_000387", "sent_id": 2}, {"doc_id": "doc_000006", "sent_id": 6}], "id": "train_000224", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-04-20 received 5 GPUs from Node Elm-08 on 2026-05-09.", "evidence": [{"doc_id": "doc_000004", "sent_id": 2}, {"doc_id": "doc_000466", "sent_id": 6}], "id": "train_000225", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Pale Compass Study 4 reported on 2026-04-08 that it used data mixing and used a reward model.", "evidence": [{"doc_id": "doc_000235", "sent_id": 5}], "id": "train_000226", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron had a failed run with Marble-3B on Node Pine-07 because of a missing-index error on 2026-05-29.", "evidence": [{"doc_id": "doc_000213", "sent_id": 8}], "id": "train_000227", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Camila Quinn was assigned as the evaluation owner for Project Meridian on 2026-04-07.", "evidence": [{"doc_id": "doc_000123", "sent_id": 7}], "id": "train_000228", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid selected Kestrel-3B for claim classification on 2026-04-13.", "evidence": [], "id": "train_000229", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Sonata changed its method from evidence pooling to BM25 retrieval on 2026-04-22.", "evidence": [{"doc_id": "doc_000024", "sent_id": 5}], "id": "train_000230", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared evidence F1 runs, Atlas-3B achieved a higher evidence F1 than Helix-3B.", "evidence": [{"doc_id": "doc_000120", "sent_id": 2}, {"doc_id": "doc_000143", "sent_id": 6}], "id": "train_000231", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Amber Ranking Study 3 reported on 2026-05-15 that it used hybrid retrieval and used a reward model.", "evidence": [{"doc_id": "doc_000001", "sent_id": 8}], "id": "train_000232", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Samir Ames was assigned as the lead for Project Sonata on 2026-05-13.", "evidence": [], "id": "train_000233", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-05-11 changed its method from late interaction to BM25 retrieval on 2026-04-19.", "evidence": [{"doc_id": "doc_000319", "sent_id": 2}, {"doc_id": "doc_000480", "sent_id": 4}], "id": "train_000234", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Kira Frost was assigned as the data steward on 2026-06-15 selected Marble-8B for claim classification on 2026-05-20.", "evidence": [{"doc_id": "doc_000193", "sent_id": 2}, {"doc_id": "doc_000286", "sent_id": 5}], "id": "train_000235", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian moved the Milestone D1 deadline from 2026-05-18 to 2026-05-22 on 2026-05-07.", "evidence": [], "id": "train_000236", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Mara Lane was assigned as the retrieval owner for Project Sonata on 2026-05-04.", "evidence": [{"doc_id": "doc_000284", "sent_id": 2}], "id": "train_000237", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Cedar-8B for error analysis on 2026-04-06.", "evidence": [], "id": "train_000238", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster's run with Helix-3B failed because of an unstable-validation-loss error on 2026-05-22 while using Node Pine-07.", "evidence": [{"doc_id": "doc_000127", "sent_id": 8}], "id": "train_000239", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Atlas-7B for reranking on 2026-04-09.", "evidence": [{"doc_id": "doc_000190", "sent_id": 3}], "id": "train_000240", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Aster-3B for evidence retrieval on 2026-06-08.", "evidence": [], "id": "train_000241", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Evan Moss was assigned as the lead for Project Nereid on 2026-06-22.", "evidence": [{"doc_id": "doc_000182", "sent_id": 2}], "id": "train_000242", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid had a failed run with River-7B on Node Aspen-01 because of an unstable-validation-loss error on 2026-05-22.", "evidence": [{"doc_id": "doc_000489", "sent_id": 8}], "id": "train_000243", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Vela-3B for reranking on 2026-06-29.", "evidence": [], "id": "train_000244", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Lena Costa was assigned as the data steward on 2026-06-23 moved the Milestone L1 deadline from 2026-07-12 to 2026-07-20 on 2026-06-26.", "evidence": [{"doc_id": "doc_000042", "sent_id": 7}, {"doc_id": "doc_000038", "sent_id": 8}], "id": "train_000245", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from LoRA adaptation to alias expansion on 2026-04-06.", "evidence": [], "id": "train_000246", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from cross-encoder reranking to late interaction on 2026-04-13.", "evidence": [], "id": "train_000247", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Petra Adler was assigned as the data steward for Project Saffron on 2026-05-04.", "evidence": [{"doc_id": "doc_000162", "sent_id": 2}], "id": "train_000248", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-23 recorded evidence F1 for Marble-3B on NereidNotes using Node Maple-01 on 2026-05-05.", "evidence": [{"doc_id": "doc_000225", "sent_id": 7}, {"doc_id": "doc_000296", "sent_id": 7}], "id": "train_000249", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared macro F1 runs, Marble-7B achieved a higher macro F1 than Vela-7B.", "evidence": [{"doc_id": "doc_000194", "sent_id": 2}, {"doc_id": "doc_000079", "sent_id": 7}], "id": "train_000250", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster's run with River-7B failed because of an out-of-memory error on 2026-05-28 while using Node Sycamore-13.", "evidence": [{"doc_id": "doc_000207", "sent_id": 3}], "id": "train_000251", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Quartz-3B for claim classification on 2026-04-11.", "evidence": [], "id": "train_000252", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Fir-10 allocated 2 GPUs to Project Saffron on 2026-04-12.", "evidence": [], "id": "train_000253", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Pale Compass Study 3 reported on 2026-05-05 that it used reward reranking and did not use a reward model.", "evidence": [], "id": "train_000254", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Vera Torres was assigned as the retrieval owner for Project Nereid on 2026-06-02.", "evidence": [{"doc_id": "doc_000403", "sent_id": 7}], "id": "train_000255", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Rohan Park was assigned as the lead on 2026-04-14 received 5 GPUs from Node Sycamore-13 on 2026-05-02.", "evidence": [{"doc_id": "doc_000327", "sent_id": 7}, {"doc_id": "doc_000362", "sent_id": 6}], "id": "train_000256", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Aspen-01 allocated 3 GPUs to Project Nereid on 2026-06-13.", "evidence": [], "id": "train_000257", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Quartz-3B for claim classification on 2026-05-08.", "evidence": [{"doc_id": "doc_000346", "sent_id": 8}], "id": "train_000258", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Finch-3B achieved a higher evidence F1 than Helix-3B.", "evidence": [{"doc_id": "doc_000472", "sent_id": 7}, {"doc_id": "doc_000223", "sent_id": 2}], "id": "train_000259", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian's run with Nova-7B failed because of a checkpoint-mismatch error on 2026-04-10 while using Node Rowan-09.", "evidence": [{"doc_id": "doc_000228", "sent_id": 8}], "id": "train_000260", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-06-01 moved the Milestone H1 deadline from 2026-07-02 to 2026-06-26 on 2026-06-12.", "evidence": [{"doc_id": "doc_000378", "sent_id": 2}, {"doc_id": "doc_000160", "sent_id": 8}], "id": "train_000261", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Nested Verifier Study 2 reported on 2026-06-20 that it used calibrated voting and did not use a reward model.", "evidence": [], "id": "train_000262", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Nested Verifier Study 1 reported on 2026-04-27 that it used QLoRA adaptation and did not use a reward model.", "evidence": [{"doc_id": "doc_000001", "sent_id": 2}], "id": "train_000263", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Samir Kwan was assigned as the retrieval owner for Project Sonata on 2026-05-19.", "evidence": [{"doc_id": "doc_000316", "sent_id": 8}], "id": "train_000264", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone T1 deadline from 2026-06-06 to 2026-06-12 on 2026-05-27.", "evidence": [{"doc_id": "doc_000476", "sent_id": 6}], "id": "train_000265", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor moved the Milestone B1 deadline from 2026-04-27 to 2026-05-03 on 2026-04-01.", "evidence": [{"doc_id": "doc_000483", "sent_id": 6}], "id": "train_000266", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Anika Sato was assigned as the lead on 2026-06-01 received 5 GPUs from Node Juniper-06 on 2026-06-13.", "evidence": [{"doc_id": "doc_000463", "sent_id": 2}, {"doc_id": "doc_000218", "sent_id": 6}], "id": "train_000267", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-05-05 changed its method from rank fusion to dense retrieval on 2026-05-17.", "evidence": [{"doc_id": "doc_000368", "sent_id": 7}, {"doc_id": "doc_000357", "sent_id": 4}], "id": "train_000268", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor moved the Milestone T1 deadline from 2026-06-23 to 2026-06-27 on 2026-06-03.", "evidence": [{"doc_id": "doc_000375", "sent_id": 5}], "id": "train_000269", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Petra Adler was assigned as the data steward for Project Saffron on 2026-06-03.", "evidence": [], "id": "train_000270", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid's run with Cedar-8B failed because of an out-of-memory error on 2026-04-17 while using Node Hazel-14.", "evidence": [{"doc_id": "doc_000242", "sent_id": 8}], "id": "train_000271", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected Lumen-3B for calibration on 2026-05-07.", "evidence": [{"doc_id": "doc_000162", "sent_id": 3}], "id": "train_000272", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Fir-10 allocated 3 GPUs to Project Sonata on 2026-06-27.", "evidence": [{"doc_id": "doc_000089", "sent_id": 5}], "id": "train_000273", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, River-3B achieved a higher macro F1 than Kestrel-3B.", "evidence": [{"doc_id": "doc_000243", "sent_id": 7}, {"doc_id": "doc_000332", "sent_id": 3}], "id": "train_000274", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Priya Moss was assigned as the data steward on 2026-06-22 selected Quartz-8B for error analysis on 2026-06-17.", "evidence": [{"doc_id": "doc_000119", "sent_id": 2}, {"doc_id": "doc_000204", "sent_id": 5}], "id": "train_000275", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster moved the Milestone T1 deadline to 2026-05-03 on 2026-04-22.", "evidence": [{"doc_id": "doc_000289", "sent_id": 5}], "id": "train_000276", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-05-05 recorded latency efficiency score for Helix-3B on CedarQA-3 using Node Poplar-12 on 2026-06-29.", "evidence": [{"doc_id": "doc_000004", "sent_id": 9}, {"doc_id": "doc_000484", "sent_id": 2}], "id": "train_000277", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-05-26 had a failed run with Vela-8B on Node Fir-10 because of a checkpoint-mismatch error on 2026-05-07.", "evidence": [{"doc_id": "doc_000286", "sent_id": 7}, {"doc_id": "doc_000489", "sent_id": 3}], "id": "train_000278", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata's run with Mira-8B failed because of an out-of-memory error on 2026-05-29 while using Node Elm-08.", "evidence": [{"doc_id": "doc_000492", "sent_id": 8}], "id": "train_000279", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-05-12 received 3 GPUs from Node Spruce-03 on 2026-04-18.", "evidence": [{"doc_id": "doc_000151", "sent_id": 7}, {"doc_id": "doc_000278", "sent_id": 6}], "id": "train_000280", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Atlas-7B for claim classification on 2026-05-10.", "evidence": [{"doc_id": "doc_000489", "sent_id": 4}], "id": "train_000281", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Nested Verifier Study 1 reported on 2026-06-25 that it used calibrated voting and did not use a reward model.", "evidence": [], "id": "train_000282", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron had a failed run with Marble-8B on Node Maple-01 because of a missing-index error on 2026-04-16.", "evidence": [{"doc_id": "doc_000269", "sent_id": 3}], "id": "train_000283", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron moved the Milestone L1 deadline from 2026-06-05 to 2026-06-09 on 2026-05-20.", "evidence": [{"doc_id": "doc_000071", "sent_id": 5}], "id": "train_000284", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian changed its method from metric smoothing to hard-negative mining on 2026-05-10.", "evidence": [{"doc_id": "doc_000162", "sent_id": 4}], "id": "train_000285", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Finch-8B achieved a higher latency efficiency score than Aster-8B.", "evidence": [{"doc_id": "doc_000137", "sent_id": 2}, {"doc_id": "doc_000493", "sent_id": 7}], "id": "train_000286", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron selected Mira-3B for calibration on 2026-05-29.", "evidence": [{"doc_id": "doc_000241", "sent_id": 8}], "id": "train_000287", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Vela-8B for reranking on 2026-06-18.", "evidence": [{"doc_id": "doc_000309", "sent_id": 4}], "id": "train_000288", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Elm-08 allocated 3 GPUs to Project Nereid on 2026-05-10.", "evidence": [], "id": "train_000289", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian changed its method from calibrated voting to confidence calibration on 2026-06-19.", "evidence": [], "id": "train_000290", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mira Sato was assigned as the data steward on 2026-06-30 moved the Milestone V1 deadline to 2026-06-10 on 2026-05-15.", "evidence": [{"doc_id": "doc_000465", "sent_id": 7}, {"doc_id": "doc_000013", "sent_id": 8}], "id": "train_000291", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from confidence calibration to calibrated voting on 2026-04-01.", "evidence": [{"doc_id": "doc_000281", "sent_id": 4}], "id": "train_000292", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Lattice Memory Study 3 reported on 2026-05-04 that it used hard-negative mining and did not use a reward model.", "evidence": [{"doc_id": "doc_000087", "sent_id": 1}], "id": "train_000293", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Stable Chains Study 1 reported on 2026-04-21 that it used chain verification and did not use a reward model.", "evidence": [], "id": "train_000294", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Kestrel-7B achieved 0.762 evidence F1 on OrionBench-3 for Project Nereid on 2026-04-18.", "evidence": [{"doc_id": "doc_000271", "sent_id": 6}], "id": "train_000295", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "River-3B achieved 0.610 accuracy on SignalSet-2 for Project Anchor on 2026-04-21.", "evidence": [{"doc_id": "doc_000026", "sent_id": 7}], "id": "train_000296", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid's run with Kestrel-3B failed on Node Birch-04 because of a missing-index error on 2026-06-19.", "evidence": [{"doc_id": "doc_000072", "sent_id": 8}], "id": "train_000297", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid selected Nova-7B for reranking on 2026-06-29.", "evidence": [], "id": "train_000298", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor changed its method from query rewriting to sentence pruning on 2026-04-17.", "evidence": [], "id": "train_000299", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared latency efficiency score runs, Mira-7B achieved a higher latency efficiency score than Nova-7B.", "evidence": [{"doc_id": "doc_000077", "sent_id": 3}, {"doc_id": "doc_000320", "sent_id": 7}], "id": "train_000300", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron moved the Milestone X1 deadline from 2026-06-08 to 2026-06-12 on 2026-05-27.", "evidence": [{"doc_id": "doc_000323", "sent_id": 5}], "id": "train_000301", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Orchid-3B achieved 0.830 macro F1 on LabQA-3 for Project Aster on 2026-04-21.", "evidence": [], "id": "train_000302", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor's run with Helix-7B failed on Node Poplar-12 because of an unstable-validation-loss error on 2026-06-25.", "evidence": [{"doc_id": "doc_000281", "sent_id": 2}], "id": "train_000303", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid changed its method from threshold search to contrastive tuning on 2026-04-01.", "evidence": [{"doc_id": "doc_000228", "sent_id": 5}], "id": "train_000304", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The review of Amber Ranking Study 3 reported on 2026-05-12 that it used late interaction and did not use a reward model.", "evidence": [], "id": "train_000305", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Aster moved the Milestone T1 deadline to 2026-06-01 on 2026-05-20.", "evidence": [], "id": "train_000306", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from rank fusion to teacher distillation on 2026-04-24.", "evidence": [], "id": "train_000307", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster moved the Milestone B1 deadline to 2026-05-18 on 2026-04-22.", "evidence": [{"doc_id": "doc_000268", "sent_id": 5}], "id": "train_000308", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Priya Moss was assigned as the data steward on 2026-06-22 selected Marble-8B for error analysis on 2026-05-07.", "evidence": [{"doc_id": "doc_000119", "sent_id": 2}, {"doc_id": "doc_000401", "sent_id": 3}], "id": "train_000309", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Kestrel-8B achieved 0.687 latency efficiency score on CedarQA for Project Meridian on 2026-05-17.", "evidence": [], "id": "train_000310", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Quiet Retriever Study 4 reported on 2026-06-25 that it used sentence pruning and did not use a reward model.", "evidence": [], "id": "train_000311", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata moved the Milestone X1 deadline to 2026-05-28 on 2026-05-06.", "evidence": [{"doc_id": "doc_000307", "sent_id": 5}], "id": "train_000312", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Helix-7B achieved a higher evidence F1 than Lumen-7B.", "evidence": [{"doc_id": "doc_000252", "sent_id": 7}, {"doc_id": "doc_000002", "sent_id": 2}], "id": "train_000313", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian selected Cedar-8B for error analysis on 2026-04-04.", "evidence": [], "id": "train_000314", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Quartz-3B achieved 0.607 accuracy on CedarQA-2 for Project Saffron on 2026-06-28.", "evidence": [], "id": "train_000315", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-04-07 selected Cedar-8B for error analysis on 2026-04-01.", "evidence": [{"doc_id": "doc_000119", "sent_id": 9}, {"doc_id": "doc_000131", "sent_id": 5}], "id": "train_000316", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Lumen-3B achieved 0.863 accuracy on MemoTrace-3 for Project Anchor on 2026-06-20.", "evidence": [{"doc_id": "doc_000376", "sent_id": 6}], "id": "train_000317", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid moved the Milestone P1 deadline to 2026-07-21 on 2026-06-28.", "evidence": [], "id": "train_000318", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata had a failed run with Mira-7B on Node Sycamore-13 because of an out-of-memory error on 2026-05-21.", "evidence": [{"doc_id": "doc_000471", "sent_id": 3}], "id": "train_000319", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "In the compared accuracy runs, Vela-3B achieved a higher accuracy than Nimbus-3B.", "evidence": [{"doc_id": "doc_000385", "sent_id": 2}, {"doc_id": "doc_000353", "sent_id": 7}], "id": "train_000320", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata changed its method from sentence pruning to query rewriting on 2026-04-08.", "evidence": [{"doc_id": "doc_000097", "sent_id": 5}], "id": "train_000321", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid changed its method from structured prompting to QLoRA adaptation on 2026-05-19.", "evidence": [], "id": "train_000322", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Samir Kwan was assigned as the retrieval owner for Project Sonata on 2026-05-20.", "evidence": [], "id": "train_000323", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Priya Moss was assigned as the data steward for Project Saffron on 2026-06-20.", "evidence": [], "id": "train_000324", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Atlas-7B achieved 0.744 evidence F1 on RiverBench-2 for Project Nereid on 2026-06-24.", "evidence": [], "id": "train_000325", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster selected Atlas-7B for calibration on 2026-04-05.", "evidence": [{"doc_id": "doc_000242", "sent_id": 4}], "id": "train_000326", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian moved the Milestone B2 deadline from 2026-05-07 to 2026-05-13 on 2026-04-22.", "evidence": [], "id": "train_000327", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Cedar-3B achieved 0.887 evidence F1 on NereidNotes-3 for Project Nereid on 2026-04-28.", "evidence": [{"doc_id": "doc_000024", "sent_id": 7}], "id": "train_000328", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Camila Quinn was assigned as the evaluation owner on 2026-05-26 moved the Milestone B2 deadline to 2026-05-01 on 2026-04-17.", "evidence": [{"doc_id": "doc_000101", "sent_id": 7}, {"doc_id": "doc_000047", "sent_id": 8}], "id": "train_000329", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Elian Shah was assigned as the data steward for Project Meridian on 2026-04-27.", "evidence": [{"doc_id": "doc_000372", "sent_id": 2}], "id": "train_000330", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Leo Park was assigned as the evaluation owner on 2026-05-12 moved the Milestone B1 deadline from 2026-07-10 to 2026-07-18 on 2026-06-24.", "evidence": [{"doc_id": "doc_000394", "sent_id": 7}, {"doc_id": "doc_000118", "sent_id": 5}], "id": "train_000331", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from contrastive tuning to data mixing on 2026-05-27.", "evidence": [{"doc_id": "doc_000391", "sent_id": 5}], "id": "train_000332", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid changed its method from threshold search to reward reranking on 2026-05-10.", "evidence": [{"doc_id": "doc_000284", "sent_id": 4}], "id": "train_000333", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Stable Chains Study 4 reported on 2026-04-14 that it used alias expansion and did not use a reward model.", "evidence": [], "id": "train_000334", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Kira Frost was assigned as the data steward on 2026-04-14 had a run with Lumen-8B on Node Spruce-03 that failed because of a missing-index error on 2026-05-22.", "evidence": [{"doc_id": "doc_000233", "sent_id": 7}, {"doc_id": "doc_000041", "sent_id": 8}], "id": "train_000335", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian had a failed run with Nimbus-8B on Node Elm-08 because of a checkpoint-mismatch error on 2026-04-03.", "evidence": [{"doc_id": "doc_000049", "sent_id": 8}], "id": "train_000336", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Nova-8B achieved 0.896 latency efficiency score on LumenFacts-2 for Project Meridian on 2026-04-21.", "evidence": [{"doc_id": "doc_000278", "sent_id": 7}], "id": "train_000337", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor's run with River-8B failed on Node Rowan-09 because of a missing-index error on 2026-04-10.", "evidence": [{"doc_id": "doc_000295", "sent_id": 8}], "id": "train_000338", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Kestrel-3B achieved a higher latency efficiency score than Mira-3B.", "evidence": [{"doc_id": "doc_000249", "sent_id": 8}, {"doc_id": "doc_000214", "sent_id": 6}], "id": "train_000339", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian changed its method from late interaction to BM25 retrieval on 2026-04-20.", "evidence": [], "id": "train_000340", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster selected River-7B for evidence retrieval on 2026-06-18.", "evidence": [{"doc_id": "doc_000231", "sent_id": 5}], "id": "train_000341", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared accuracy runs, Kestrel-3B achieved a higher accuracy than Aster-3B.", "evidence": [{"doc_id": "doc_000229", "sent_id": 7}, {"doc_id": "doc_000261", "sent_id": 2}], "id": "train_000342", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Nova-8B achieved 0.851 evidence F1 on LabQA for Project Nereid on 2026-05-26.", "evidence": [{"doc_id": "doc_000213", "sent_id": 7}], "id": "train_000343", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Selene Kim was assigned as the evaluation owner for Project Anchor on 2026-06-01.", "evidence": [{"doc_id": "doc_000352", "sent_id": 2}], "id": "train_000344", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-04-06 selected Mira-3B for error analysis on 2026-04-19.", "evidence": [{"doc_id": "doc_000114", "sent_id": 2}, {"doc_id": "doc_000264", "sent_id": 4}], "id": "train_000345", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata changed its method from rank fusion to teacher distillation on 2026-06-21.", "evidence": [{"doc_id": "doc_000118", "sent_id": 4}], "id": "train_000346", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from late interaction to BM25 retrieval on 2026-05-31.", "evidence": [{"doc_id": "doc_000408", "sent_id": 5}], "id": "train_000347", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-05-12 selected Lumen-8B for evidence retrieval on 2026-05-27.", "evidence": [{"doc_id": "doc_000165", "sent_id": 7}, {"doc_id": "doc_000266", "sent_id": 5}], "id": "train_000348", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Theo Lind was assigned as the evaluation owner on 2026-05-19 moved the Milestone R1 deadline from 2026-06-22 to 2026-06-26 on 2026-05-29.", "evidence": [{"doc_id": "doc_000023", "sent_id": 7}, {"doc_id": "doc_000090", "sent_id": 8}], "id": "train_000349", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor's run with Orchid-8B failed because of a checkpoint-mismatch error on 2026-06-19 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000034", "sent_id": 8}], "id": "train_000350", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Silver Notes Study 3 reported on 2026-06-19 that it used contrastive tuning and did not use a reward model.", "evidence": [{"doc_id": "doc_000301", "sent_id": 7}], "id": "train_000351", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Cedar-3B achieved 0.681 accuracy on VestaLogs-2 for Project Meridian on 2026-06-20.", "evidence": [{"doc_id": "doc_000003", "sent_id": 6}], "id": "train_000352", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-04-19 recorded accuracy for Nimbus-7B on MemoTrace using Node Hazel-14 on 2026-06-29.", "evidence": [{"doc_id": "doc_000016", "sent_id": 4}, {"doc_id": "doc_000040", "sent_id": 2}], "id": "train_000353", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Atlas-7B for error analysis on 2026-06-14.", "evidence": [{"doc_id": "doc_000032", "sent_id": 4}], "id": "train_000354", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Elian Ames was assigned as the evaluation owner on 2026-05-04 selected Nimbus-3B for reranking on 2026-06-10.", "evidence": [{"doc_id": "doc_000018", "sent_id": 2}, {"doc_id": "doc_000065", "sent_id": 5}], "id": "train_000355", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Finch-7B for error analysis on 2026-04-14.", "evidence": [], "id": "train_000356", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Ravi Nadir was assigned as the data steward for Project Meridian on 2026-06-06.", "evidence": [], "id": "train_000357", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Cedar-7B achieved a higher macro F1 than Quartz-3B.", "evidence": [{"doc_id": "doc_000400", "sent_id": 6}, {"doc_id": "doc_000298", "sent_id": 6}], "id": "train_000358", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Lumen-7B achieved 0.822 latency efficiency score on MemoTrace-3 for Project Anchor on 2026-06-01.", "evidence": [{"doc_id": "doc_000034", "sent_id": 2}], "id": "train_000359", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Lumen-7B achieved 0.604 latency efficiency score on MemoTrace-3 for Project Anchor on 2026-05-26.", "evidence": [{"doc_id": "doc_000232", "sent_id": 7}], "id": "train_000360", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Finch-8B for error analysis on 2026-06-08.", "evidence": [], "id": "train_000361", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Kestrel-3B achieved 0.852 macro F1 on OrionBench-3 for Project Nereid on 2026-04-27.", "evidence": [], "id": "train_000362", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata moved the Milestone L1 deadline to 2026-05-30 on 2026-04-29.", "evidence": [], "id": "train_000363", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Nora Sol was assigned as the retrieval owner for Project Nereid on 2026-05-04.", "evidence": [{"doc_id": "doc_000448", "sent_id": 2}], "id": "train_000364", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Fir-10 allocated 5 GPUs to Project Sonata on 2026-06-20.", "evidence": [{"doc_id": "doc_000032", "sent_id": 6}], "id": "train_000365", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Elm-08 allocated 1 GPU to Project Nereid on 2026-06-25.", "evidence": [], "id": "train_000366", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian moved the Milestone D1 deadline from 2026-05-21 to 2026-05-27 on 2026-04-29.", "evidence": [{"doc_id": "doc_000298", "sent_id": 5}], "id": "train_000367", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Atlas-7B for evidence retrieval on 2026-06-18.", "evidence": [{"doc_id": "doc_000118", "sent_id": 3}], "id": "train_000368", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-04-20 received 3 GPUs from Node Elm-08 on 2026-04-04.", "evidence": [{"doc_id": "doc_000004", "sent_id": 2}, {"doc_id": "doc_000150", "sent_id": 6}], "id": "train_000369", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Finch-8B achieved a higher evidence F1 than Vela-8B.", "evidence": [{"doc_id": "doc_000234", "sent_id": 6}, {"doc_id": "doc_000004", "sent_id": 7}], "id": "train_000370", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Talia Marin was assigned as the lead on 2026-06-08 selected Quartz-7B for error analysis on 2026-04-10.", "evidence": [{"doc_id": "doc_000259", "sent_id": 2}, {"doc_id": "doc_000308", "sent_id": 10}], "id": "train_000371", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid moved the Milestone J1 deadline to 2026-06-23 on 2026-05-20.", "evidence": [{"doc_id": "doc_000214", "sent_id": 5}], "id": "train_000372", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor selected Orchid-8B for reranking on 2026-04-10.", "evidence": [{"doc_id": "doc_000355", "sent_id": 10}], "id": "train_000373", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Nora Bauer was assigned as the lead for Project Nereid on 2026-05-11.", "evidence": [{"doc_id": "doc_000030", "sent_id": 2}], "id": "train_000374", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Pale Compass Study 1 reported on 2026-04-04 that it used data mixing and did not use a reward model.", "evidence": [], "id": "train_000375", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Dr. Julian Gray was assigned as the lead for Project Nereid on 2026-04-15.", "evidence": [], "id": "train_000376", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Iris Lane was assigned as the evaluation owner for Project Anchor on 2026-04-28.", "evidence": [{"doc_id": "doc_000073", "sent_id": 7}], "id": "train_000377", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Priya Moss was assigned as the data steward on 2026-05-11 changed its method from BM25 retrieval to evidence pooling on 2026-06-14.", "evidence": [{"doc_id": "doc_000090", "sent_id": 2}, {"doc_id": "doc_000003", "sent_id": 4}], "id": "train_000378", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Vela-7B achieved 0.696 accuracy on LumenFacts-3 for Project Saffron on 2026-05-05.", "evidence": [{"doc_id": "doc_000423", "sent_id": 7}], "id": "train_000379", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Mira-7B achieved 0.583 latency efficiency score on MemoTrace-2 for Project Saffron on 2026-04-08.", "evidence": [], "id": "train_000380", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Nested Verifier Study 3 reported on 2026-05-21 that it used hard-negative mining and did not use a reward model.", "evidence": [], "id": "train_000381", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Ravi Nadir was assigned as the data steward on 2026-06-22 received 4 GPUs from Node Laurel-11 on 2026-05-23.", "evidence": [{"doc_id": "doc_000103", "sent_id": 2}, {"doc_id": "doc_000009", "sent_id": 6}], "id": "train_000382", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Vera Kim was assigned as the lead on 2026-06-01 received 1 GPU from Node Laurel-11 on 2026-05-14.", "evidence": [{"doc_id": "doc_000461", "sent_id": 2}, {"doc_id": "doc_000136", "sent_id": 3}], "id": "train_000383", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from confidence calibration to chain verification on 2026-04-26.", "evidence": [{"doc_id": "doc_000238", "sent_id": 4}], "id": "train_000384", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Quartz-3B achieved 0.768 accuracy on CedarQA-2 for Project Saffron on 2026-06-09.", "evidence": [{"doc_id": "doc_000128", "sent_id": 7}], "id": "train_000385", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from teacher distillation to query rewriting on 2026-04-12.", "evidence": [{"doc_id": "doc_000092", "sent_id": 4}], "id": "train_000386", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian changed its method from BM25 retrieval to late interaction on 2026-06-24.", "evidence": [{"doc_id": "doc_000089", "sent_id": 4}], "id": "train_000387", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian's run with Kestrel-8B failed on Node Birch-04 because of a missing-index error on 2026-05-01.", "evidence": [{"doc_id": "doc_000398", "sent_id": 8}], "id": "train_000388", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from alias expansion to dense retrieval on 2026-05-05.", "evidence": [], "id": "train_000389", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The review of Pale Compass Study 2 reported on 2026-06-04 that it used contrastive tuning and did not use a reward model.", "evidence": [], "id": "train_000390", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Mara Lane was assigned as the retrieval owner on 2026-06-08 changed its method from hybrid retrieval to QLoRA adaptation on 2026-06-20.", "evidence": [{"doc_id": "doc_000204", "sent_id": 2}, {"doc_id": "doc_000494", "sent_id": 6}], "id": "train_000391", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Aster-8B for evidence retrieval on 2026-05-29.", "evidence": [{"doc_id": "doc_000071", "sent_id": 8}], "id": "train_000392", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian's run with Atlas-7B failed because of a missing-index error on 2026-05-07 while using Node Poplar-12.", "evidence": [{"doc_id": "doc_000223", "sent_id": 3}], "id": "train_000393", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Finch-3B for error analysis on 2026-04-04.", "evidence": [], "id": "train_000394", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-06-02 received 6 GPUs from Node Birch-04 on 2026-06-10.", "evidence": [{"doc_id": "doc_000256", "sent_id": 7}, {"doc_id": "doc_000410", "sent_id": 4}], "id": "train_000395", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from late interaction to cross-encoder reranking on 2026-04-15.", "evidence": [{"doc_id": "doc_000278", "sent_id": 5}], "id": "train_000396", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Marble-8B achieved a higher macro F1 than Quartz-8B.", "evidence": [{"doc_id": "doc_000137", "sent_id": 7}, {"doc_id": "doc_000229", "sent_id": 2}], "id": "train_000397", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Mira Nolan was assigned as the evaluation owner for Project Meridian on 2026-06-08.", "evidence": [{"doc_id": "doc_000405", "sent_id": 2}], "id": "train_000398", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Ravi Nadir was assigned as the data steward for Project Meridian on 2026-04-08.", "evidence": [], "id": "train_000399", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared accuracy runs, Nimbus-7B achieved a higher accuracy than Kestrel-7B.", "evidence": [{"doc_id": "doc_000040", "sent_id": 2}, {"doc_id": "doc_000257", "sent_id": 7}], "id": "train_000400", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid moved the Milestone B2 deadline from 2026-05-03 to 2026-04-24 on 2026-04-15.", "evidence": [{"doc_id": "doc_000190", "sent_id": 5}], "id": "train_000401", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Aster-8B achieved a higher latency efficiency score than Mira-8B.", "evidence": [{"doc_id": "doc_000125", "sent_id": 2}, {"doc_id": "doc_000002", "sent_id": 7}], "id": "train_000402", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster changed its method from threshold search to contrastive tuning on 2026-06-14.", "evidence": [{"doc_id": "doc_000062", "sent_id": 4}], "id": "train_000403", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster moved the Milestone T1 deadline to 2026-04-19 on 2026-04-08.", "evidence": [{"doc_id": "doc_000414", "sent_id": 6}], "id": "train_000404", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Felix Brooks was assigned as the retrieval owner on 2026-06-01 selected River-8B for evidence retrieval on 2026-06-24.", "evidence": [{"doc_id": "doc_000435", "sent_id": 2}, {"doc_id": "doc_000083", "sent_id": 5}], "id": "train_000405", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid's run with Nimbus-8B failed on Node Elm-08 because of a missing-index error on 2026-05-15.", "evidence": [{"doc_id": "doc_000215", "sent_id": 8}], "id": "train_000406", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Mara Lane was assigned as the retrieval owner on 2026-04-20 selected Vela-3B for error analysis on 2026-04-29.", "evidence": [{"doc_id": "doc_000346", "sent_id": 2}, {"doc_id": "doc_000498", "sent_id": 5}], "id": "train_000407", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Stable Chains Study 3 reported on 2026-06-20 that it used BM25 retrieval and did not use a reward model.", "evidence": [], "id": "train_000408", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron moved the Milestone D2 deadline from 2026-05-19 to 2026-05-27 on 2026-04-29.", "evidence": [{"doc_id": "doc_000346", "sent_id": 5}], "id": "train_000409", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Atlas-8B achieved 0.714 evidence F1 on RiverBench-2 for Project Nereid on 2026-04-26.", "evidence": [], "id": "train_000410", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Quartz-7B achieved 0.693 evidence F1 on OrionBench for Project Sonata on 2026-04-07.", "evidence": [{"doc_id": "doc_000295", "sent_id": 7}], "id": "train_000411", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Quartz-3B achieved 0.648 latency efficiency score on CedarQA-2 for Project Saffron on 2026-05-25.", "evidence": [{"doc_id": "doc_000207", "sent_id": 2}], "id": "train_000412", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-04-14 selected Marble-7B for error analysis on 2026-06-16.", "evidence": [{"doc_id": "doc_000209", "sent_id": 7}, {"doc_id": "doc_000172", "sent_id": 6}], "id": "train_000413", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Fir-10 allocated 6 GPUs to Project Saffron on 2026-05-02.", "evidence": [{"doc_id": "doc_000416", "sent_id": 6}], "id": "train_000414", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Sonata selected Marble-3B for error analysis on 2026-06-25.", "evidence": [{"doc_id": "doc_000103", "sent_id": 4}], "id": "train_000415", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian changed its method from reward reranking to threshold search on 2026-05-13.", "evidence": [{"doc_id": "doc_000489", "sent_id": 5}], "id": "train_000416", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mara Lane was assigned as the retrieval owner for Project Sonata on 2026-04-20.", "evidence": [{"doc_id": "doc_000346", "sent_id": 2}], "id": "train_000417", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Aspen-01 allocated 5 GPUs to Project Nereid on 2026-05-16.", "evidence": [{"doc_id": "doc_000223", "sent_id": 6}], "id": "train_000418", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mina Torres was assigned as the data steward for Project Meridian on 2026-05-12.", "evidence": [{"doc_id": "doc_000015", "sent_id": 7}], "id": "train_000419", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from alias expansion to dense retrieval on 2026-04-06.", "evidence": [], "id": "train_000420", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid selected Orchid-8B for reranking on 2026-05-31.", "evidence": [{"doc_id": "doc_000140", "sent_id": 4}], "id": "train_000421", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mira-7B achieved 0.744 accuracy on MemoTrace-2 for Project Saffron on 2026-04-28.", "evidence": [{"doc_id": "doc_000183", "sent_id": 6}], "id": "train_000422", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian moved the Milestone J1 deadline from 2026-04-25 to 2026-05-01 on 2026-04-15.", "evidence": [{"doc_id": "doc_000061", "sent_id": 5}], "id": "train_000423", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from reward reranking to data mixing on 2026-04-27.", "evidence": [], "id": "train_000424", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-05-26 recorded latency efficiency score for Aster-3B on SignalSet using Node Cedar-02 on 2026-04-11.", "evidence": [{"doc_id": "doc_000286", "sent_id": 7}, {"doc_id": "doc_000292", "sent_id": 6}], "id": "train_000425", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Lumen-7B achieved a higher latency efficiency score than Atlas-7B.", "evidence": [{"doc_id": "doc_000481", "sent_id": 6}, {"doc_id": "doc_000452", "sent_id": 6}], "id": "train_000426", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster changed its method from data mixing to contrastive tuning on 2026-04-12.", "evidence": [{"doc_id": "doc_000061", "sent_id": 4}], "id": "train_000427", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Talia Reed was assigned as the retrieval owner for Project Aster on 2026-06-15.", "evidence": [{"doc_id": "doc_000118", "sent_id": 2}], "id": "train_000428", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Atlas-7B for evidence retrieval on 2026-05-31.", "evidence": [{"doc_id": "doc_000474", "sent_id": 4}], "id": "train_000429", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Orchid-3B achieved a higher accuracy than River-3B.", "evidence": [{"doc_id": "doc_000488", "sent_id": 1}, {"doc_id": "doc_000026", "sent_id": 7}], "id": "train_000430", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron moved the Milestone D2 deadline from 2026-06-14 to 2026-06-24 on 2026-06-03.", "evidence": [], "id": "train_000431", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster had a failed run with Cedar-8B on Node Juniper-06 because of an unstable-validation-loss error on 2026-06-25.", "evidence": [{"doc_id": "doc_000139", "sent_id": 3}], "id": "train_000432", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Willow-05 allocated 4 GPUs to Project Saffron on 2026-05-31.", "evidence": [], "id": "train_000433", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Ravi Reed was assigned as the evaluation owner for Project Meridian on 2026-06-15.", "evidence": [{"doc_id": "doc_000369", "sent_id": 2}], "id": "train_000434", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "River-3B achieved 0.884 evidence F1 on RiverBench for Project Aster on 2026-04-04.", "evidence": [{"doc_id": "doc_000142", "sent_id": 6}], "id": "train_000435", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Talia Reed was assigned as the retrieval owner on 2026-04-14 moved the Milestone Z1 deadline to 2026-07-01 on 2026-06-10.", "evidence": [{"doc_id": "doc_000047", "sent_id": 7}, {"doc_id": "doc_000352", "sent_id": 5}], "id": "train_000436", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Orchid-3B achieved 0.946 accuracy on LumenFacts for Project Anchor on 2026-06-15.", "evidence": [{"doc_id": "doc_000488", "sent_id": 1}], "id": "train_000437", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from rank fusion to dense retrieval on 2026-04-29.", "evidence": [{"doc_id": "doc_000397", "sent_id": 5}], "id": "train_000438", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron had a failed run with Quartz-7B on Node Pine-07 because of a missing-index error on 2026-05-15.", "evidence": [{"doc_id": "doc_000385", "sent_id": 8}], "id": "train_000439", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Nova-8B for reranking on 2026-04-30.", "evidence": [{"doc_id": "doc_000400", "sent_id": 3}], "id": "train_000440", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid had a failed run with Vela-7B on Node Laurel-11 because of an unstable-validation-loss error on 2026-04-24.", "evidence": [{"doc_id": "doc_000026", "sent_id": 8}], "id": "train_000441", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Cedar-3B achieved a higher accuracy than River-3B.", "evidence": [{"doc_id": "doc_000130", "sent_id": 2}, {"doc_id": "doc_000026", "sent_id": 7}], "id": "train_000442", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Kestrel-7B achieved 0.758 latency efficiency score on CedarQA for Project Meridian on 2026-04-20.", "evidence": [{"doc_id": "doc_000416", "sent_id": 2}], "id": "train_000443", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-05-26 recorded evidence F1 for Kestrel-8B on OrionBench-3 using Node Birch-04 on 2026-05-03.", "evidence": [{"doc_id": "doc_000241", "sent_id": 7}, {"doc_id": "doc_000305", "sent_id": 4}], "id": "train_000444", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Orchid-3B achieved 0.815 accuracy on LumenFacts for Project Anchor on 2026-06-28.", "evidence": [], "id": "train_000445", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from reward reranking to threshold search on 2026-06-24.", "evidence": [{"doc_id": "doc_000108", "sent_id": 5}], "id": "train_000446", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Pine-07 allocated 6 GPUs to Project Saffron on 2026-06-06.", "evidence": [{"doc_id": "doc_000413", "sent_id": 5}], "id": "train_000447", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from document chunking to metric smoothing on 2026-05-31.", "evidence": [{"doc_id": "doc_000262", "sent_id": 4}], "id": "train_000448", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Fir-10 allocated 1 GPU to Project Sonata on 2026-05-09.", "evidence": [], "id": "train_000449", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron had a failed run with Lumen-7B on Node Fir-10 because of a checkpoint-mismatch error on 2026-05-01.", "evidence": [{"doc_id": "doc_000479", "sent_id": 8}], "id": "train_000450", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Stable Chains Study 4 reported on 2026-05-21 that it used alias expansion and did not use a reward model.", "evidence": [], "id": "train_000451", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Mina Torres was assigned as the retrieval owner for Project Sonata on 2026-06-02.", "evidence": [{"doc_id": "doc_000417", "sent_id": 7}], "id": "train_000452", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Cedar-8B for error analysis on 2026-05-08.", "evidence": [{"doc_id": "doc_000298", "sent_id": 8}], "id": "train_000453", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-05-03 recorded macro F1 for Lumen-7B on TraceEval using Node Spruce-03 on 2026-05-09.", "evidence": [{"doc_id": "doc_000485", "sent_id": 3}, {"doc_id": "doc_000462", "sent_id": 6}], "id": "train_000454", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from QLoRA adaptation to hybrid retrieval on 2026-04-05.", "evidence": [{"doc_id": "doc_000070", "sent_id": 4}], "id": "train_000455", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Atlas-7B for reranking on 2026-06-21.", "evidence": [{"doc_id": "doc_000049", "sent_id": 4}], "id": "train_000456", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Lumen-3B achieved 0.833 latency efficiency score on MemoTrace-3 for Project Anchor on 2026-05-19.", "evidence": [], "id": "train_000457", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Rohan Singh was assigned as the retrieval owner on 2026-04-13 recorded macro F1 for Mira-7B on TraceEval-3 using Node Sycamore-13 on 2026-04-21.", "evidence": [{"doc_id": "doc_000234", "sent_id": 2}, {"doc_id": "doc_000206", "sent_id": 6}], "id": "train_000458", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from confidence calibration to chain verification on 2026-06-29.", "evidence": [], "id": "train_000459", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor selected Atlas-7B for calibration on 2026-06-05.", "evidence": [{"doc_id": "doc_000417", "sent_id": 8}], "id": "train_000460", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron had a failed run with Vela-3B on Node Fir-10 because of a checkpoint-mismatch error on 2026-05-21.", "evidence": [{"doc_id": "doc_000391", "sent_id": 3}], "id": "train_000461", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian changed its method from query rewriting to teacher distillation on 2026-06-17.", "evidence": [{"doc_id": "doc_000032", "sent_id": 5}], "id": "train_000462", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Camila Quinn was assigned as the evaluation owner on 2026-05-26 moved the Milestone P1 deadline to 2026-05-26 on 2026-05-18.", "evidence": [{"doc_id": "doc_000101", "sent_id": 7}, {"doc_id": "doc_000460", "sent_id": 2}], "id": "train_000463", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Kira Frost was assigned as the data steward on 2026-06-15 selected River-3B for error analysis on 2026-05-03.", "evidence": [{"doc_id": "doc_000193", "sent_id": 2}, {"doc_id": "doc_000252", "sent_id": 4}], "id": "train_000464", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata changed its method from hybrid retrieval to document chunking on 2026-04-12.", "evidence": [{"doc_id": "doc_000271", "sent_id": 4}], "id": "train_000465", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Willow-05 allocated 2 GPUs to Project Saffron on 2026-05-02.", "evidence": [], "id": "train_000466", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor changed its method from BM25 retrieval to late interaction on 2026-05-13.", "evidence": [{"doc_id": "doc_000223", "sent_id": 5}], "id": "train_000467", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Nimbus-7B achieved a higher macro F1 than Vela-7B.", "evidence": [{"doc_id": "doc_000479", "sent_id": 7}, {"doc_id": "doc_000467", "sent_id": 2}], "id": "train_000468", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron selected Mira-8B for calibration on 2026-05-16.", "evidence": [], "id": "train_000469", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Stable Chains Study 4 reported on 2026-04-17 that it used query rewriting and used a reward model.", "evidence": [{"doc_id": "doc_000205", "sent_id": 8}], "id": "train_000470", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-04-27 changed its method from calibrated voting to confidence calibration on 2026-06-28.", "evidence": [{"doc_id": "doc_000015", "sent_id": 2}, {"doc_id": "doc_000483", "sent_id": 4}], "id": "train_000471", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Orchid-8B for reranking on 2026-06-05.", "evidence": [{"doc_id": "doc_000290", "sent_id": 8}], "id": "train_000472", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Quartz-8B achieved 0.847 accuracy on CedarQA-2 for Project Saffron on 2026-04-15.", "evidence": [], "id": "train_000473", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Quartz-7B achieved a higher latency efficiency score than Mira-8B.", "evidence": [{"doc_id": "doc_000067", "sent_id": 7}, {"doc_id": "doc_000173", "sent_id": 2}], "id": "train_000474", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Dr. Ravi Nadir was assigned as the data steward on 2026-04-07 recorded latency efficiency score for Nova-7B on LumenFacts-2 using Node Poplar-12 on 2026-04-25.", "evidence": [{"doc_id": "doc_000300", "sent_id": 7}, {"doc_id": "doc_000268", "sent_id": 6}], "id": "train_000475", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata's run with Marble-3B failed on Node Maple-01 because of an out-of-memory error on 2026-05-29.", "evidence": [{"doc_id": "doc_000433", "sent_id": 8}], "id": "train_000476", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-05-11 changed its method from sentence pruning to query rewriting on 2026-06-04.", "evidence": [{"doc_id": "doc_000319", "sent_id": 2}, {"doc_id": "doc_000451", "sent_id": 3}], "id": "train_000477", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Nora Bauer was assigned as the lead for Project Nereid on 2026-05-19.", "evidence": [{"doc_id": "doc_000162", "sent_id": 7}], "id": "train_000478", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Birch-04 allocated 2 GPUs to Project Meridian on 2026-05-23.", "evidence": [], "id": "train_000479", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata selected Aster-8B for evidence retrieval on 2026-06-04.", "evidence": [{"doc_id": "doc_000169", "sent_id": 3}], "id": "train_000480", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Kestrel-3B for claim classification on 2026-06-05.", "evidence": [{"doc_id": "doc_000177", "sent_id": 8}], "id": "train_000481", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Cedar-8B achieved 0.801 latency efficiency score on VestaLogs-2 for Project Meridian on 2026-06-10.", "evidence": [], "id": "train_000482", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron had a failed run with Vela-3B on Node Fir-10 because of a checkpoint-mismatch error on 2026-05-16.", "evidence": [], "id": "train_000483", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-04-06 selected Nova-8B for calibration on 2026-04-01.", "evidence": [{"doc_id": "doc_000114", "sent_id": 2}, {"doc_id": "doc_000310", "sent_id": 5}], "id": "train_000484", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Iris Lane was assigned as the evaluation owner on 2026-05-04 received 6 GPUs from Node Spruce-03 on 2026-04-08.", "evidence": [{"doc_id": "doc_000316", "sent_id": 2}, {"doc_id": "doc_000343", "sent_id": 4}], "id": "train_000485", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Vera Kim was assigned as the lead on 2026-04-20 changed its method from QLoRA adaptation to hybrid retrieval on 2026-05-03.", "evidence": [{"doc_id": "doc_000368", "sent_id": 2}, {"doc_id": "doc_000015", "sent_id": 4}], "id": "train_000486", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Quartz-3B for claim classification on 2026-06-12.", "evidence": [{"doc_id": "doc_000098", "sent_id": 8}], "id": "train_000487", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Kestrel-3B achieved 0.833 evidence F1 on OrionBench-3 for Project Nereid on 2026-06-22.", "evidence": [{"doc_id": "doc_000422", "sent_id": 2}], "id": "train_000488", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Cedar-02 allocated 4 GPUs to Project Anchor on 2026-06-13.", "evidence": [{"doc_id": "doc_000430", "sent_id": 6}], "id": "train_000489", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid selected Atlas-8B for evidence retrieval on 2026-06-08.", "evidence": [], "id": "train_000490", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Samir Ames was assigned as the lead on 2026-05-18 received 3 GPUs from Node Sycamore-13 on 2026-06-13.", "evidence": [{"doc_id": "doc_000256", "sent_id": 2}, {"doc_id": "doc_000072", "sent_id": 6}], "id": "train_000491", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Dr. Noah Vale was assigned as the lead for Project Aster on 2026-06-23.", "evidence": [{"doc_id": "doc_000427", "sent_id": 7}], "id": "train_000492", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Finch-8B achieved 0.716 macro F1 on NereidNotes-2 for Project Aster on 2026-05-06.", "evidence": [], "id": "train_000493", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Willow-05 allocated 1 GPU to Project Sonata on 2026-05-29.", "evidence": [], "id": "train_000494", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-06-29 moved the Milestone J1 deadline from 2026-05-27 to 2026-05-31 on 2026-05-07.", "evidence": [{"doc_id": "doc_000047", "sent_id": 2}, {"doc_id": "doc_000023", "sent_id": 3}], "id": "train_000495", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Sofia Nadir was assigned as the retrieval owner on 2026-06-02 changed its method from document chunking to hybrid retrieval on 2026-05-03.", "evidence": [{"doc_id": "doc_000417", "sent_id": 7}, {"doc_id": "doc_000272", "sent_id": 3}], "id": "train_000496", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Sofia Nadir was assigned as the retrieval owner on 2026-04-21 moved the Milestone F1 deadline to 2026-06-10 on 2026-05-20.", "evidence": [{"doc_id": "doc_000145", "sent_id": 7}, {"doc_id": "doc_000241", "sent_id": 5}], "id": "train_000497", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian moved the Milestone B2 deadline from 2026-06-26 to 2026-07-04 on 2026-06-01.", "evidence": [], "id": "train_000498", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vela-8B achieved 0.550 macro F1 on LabQA-2 for Project Sonata on 2026-04-11.", "evidence": [{"doc_id": "doc_000374", "sent_id": 6}], "id": "train_000499", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from late interaction to cross-encoder reranking on 2026-05-13.", "evidence": [{"doc_id": "doc_000450", "sent_id": 5}], "id": "train_000500", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Vela-8B achieved a higher latency efficiency score than Helix-8B.", "evidence": [{"doc_id": "doc_000176", "sent_id": 7}, {"doc_id": "doc_000240", "sent_id": 2}], "id": "train_000501", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Stable Chains Study 4 reported on 2026-06-11 that it used query rewriting and did not use a reward model.", "evidence": [], "id": "train_000502", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, Aster-7B achieved a higher accuracy than Helix-7B.", "evidence": [{"doc_id": "doc_000263", "sent_id": 6}, {"doc_id": "doc_000279", "sent_id": 3}], "id": "train_000503", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Amber Ranking Study 3 reported on 2026-05-11 that it used late interaction and did not use a reward model.", "evidence": [{"doc_id": "doc_000149", "sent_id": 2}], "id": "train_000504", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected River-8B for evidence retrieval on 2026-05-01.", "evidence": [{"doc_id": "doc_000268", "sent_id": 8}], "id": "train_000505", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Evan Moss was assigned as the lead for Project Nereid on 2026-06-30.", "evidence": [{"doc_id": "doc_000288", "sent_id": 10}], "id": "train_000506", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Rowan-09 allocated 5 GPUs to Project Aster on 2026-06-06.", "evidence": [{"doc_id": "doc_000140", "sent_id": 6}], "id": "train_000507", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Vela-8B achieved a higher latency efficiency score than Kestrel-8B.", "evidence": [{"doc_id": "doc_000429", "sent_id": 3}, {"doc_id": "doc_000443", "sent_id": 7}], "id": "train_000508", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid moved the Milestone D1 deadline to 2026-04-27 on 2026-04-01.", "evidence": [{"doc_id": "doc_000103", "sent_id": 7}], "id": "train_000509", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor selected Orchid-7B for reranking on 2026-06-27.", "evidence": [], "id": "train_000510", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron changed its method from evidence pooling to calibrated voting on 2026-06-05.", "evidence": [], "id": "train_000511", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, River-8B achieved a higher evidence F1 than Nova-8B.", "evidence": [{"doc_id": "doc_000031", "sent_id": 6}, {"doc_id": "doc_000044", "sent_id": 6}], "id": "train_000512", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared evidence F1 runs, Orchid-7B achieved a higher evidence F1 than Lumen-7B.", "evidence": [{"doc_id": "doc_000410", "sent_id": 3}, {"doc_id": "doc_000168", "sent_id": 7}], "id": "train_000513", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Rohan Singh was assigned as the retrieval owner for Project Sonata on 2026-04-13.", "evidence": [{"doc_id": "doc_000234", "sent_id": 2}], "id": "train_000514", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared accuracy runs, Lumen-8B achieved a higher accuracy than Quartz-8B.", "evidence": [{"doc_id": "doc_000098", "sent_id": 6}, {"doc_id": "doc_000459", "sent_id": 6}], "id": "train_000515", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-06-16 changed its method from QLoRA adaptation to structured prompting on 2026-04-19.", "evidence": [{"doc_id": "doc_000435", "sent_id": 7}, {"doc_id": "doc_000126", "sent_id": 3}], "id": "train_000516", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid moved the Milestone D1 deadline to 2026-05-20 on 2026-04-25.", "evidence": [], "id": "train_000517", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Nova-8B for reranking on 2026-05-21.", "evidence": [{"doc_id": "doc_000290", "sent_id": 3}], "id": "train_000518", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-08 changed its method from evidence pooling to BM25 retrieval on 2026-06-10.", "evidence": [{"doc_id": "doc_000062", "sent_id": 2}, {"doc_id": "doc_000334", "sent_id": 5}], "id": "train_000519", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Nova-3B achieved 0.791 macro F1 on LabQA for Project Nereid on 2026-05-25.", "evidence": [{"doc_id": "doc_000474", "sent_id": 2}], "id": "train_000520", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from temporal filtering to alias expansion on 2026-06-14.", "evidence": [{"doc_id": "doc_000376", "sent_id": 4}], "id": "train_000521", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Quiet Retriever Study 1 reported on 2026-05-30 that it used structured prompting and did not use a reward model.", "evidence": [], "id": "train_000522", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster's run with Lumen-3B failed on Node Spruce-03 because of an unstable-validation-loss error on 2026-04-11.", "evidence": [], "id": "train_000523", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Atlas-7B for calibration on 2026-06-19.", "evidence": [{"doc_id": "doc_000093", "sent_id": 8}], "id": "train_000524", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata moved the Milestone D2 deadline to 2026-04-27 on 2026-04-15.", "evidence": [], "id": "train_000525", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Finch-3B for error analysis on 2026-04-27.", "evidence": [], "id": "train_000526", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared latency efficiency score runs, Atlas-3B achieved a higher latency efficiency score than Kestrel-8B.", "evidence": [{"doc_id": "doc_000146", "sent_id": 4}, {"doc_id": "doc_000443", "sent_id": 7}], "id": "train_000527", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Iris Lane was assigned as the evaluation owner on 2026-05-19 recorded latency efficiency score for Lumen-7B on MemoTrace-3 using Node Sycamore-13 on 2026-06-01.", "evidence": [{"doc_id": "doc_000495", "sent_id": 6}, {"doc_id": "doc_000034", "sent_id": 2}], "id": "train_000528", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Nadia Chen was assigned as the evaluation owner on 2026-06-16 changed its method from query rewriting to sentence pruning on 2026-05-24.", "evidence": [{"doc_id": "doc_000093", "sent_id": 7}, {"doc_id": "doc_000403", "sent_id": 4}], "id": "train_000529", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian moved the Milestone V1 deadline from 2026-05-04 to 2026-05-14 on 2026-04-08.", "evidence": [{"doc_id": "doc_000374", "sent_id": 5}], "id": "train_000530", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from evidence pooling to calibrated voting on 2026-06-15.", "evidence": [], "id": "train_000531", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata moved the Milestone F1 deadline to 2026-07-13 on 2026-06-24.", "evidence": [], "id": "train_000532", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-21 recorded evidence F1 for Cedar-3B on NereidNotes-3 using Node Hazel-14 on 2026-04-28.", "evidence": [{"doc_id": "doc_000340", "sent_id": 3}, {"doc_id": "doc_000024", "sent_id": 7}], "id": "train_000533", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-06-22 moved the Milestone B2 deadline to 2026-05-03 on 2026-04-15.", "evidence": [{"doc_id": "doc_000365", "sent_id": 2}, {"doc_id": "doc_000190", "sent_id": 5}], "id": "train_000534", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor's run with Lumen-8B failed on Node Spruce-03 because of a checkpoint-mismatch error on 2026-05-28.", "evidence": [{"doc_id": "doc_000140", "sent_id": 3}], "id": "train_000535", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-04-06 moved the Milestone H1 deadline from 2026-05-31 to 2026-05-25 on 2026-05-11.", "evidence": [{"doc_id": "doc_000145", "sent_id": 2}, {"doc_id": "doc_000136", "sent_id": 2}], "id": "train_000536", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Marble-7B achieved a higher evidence F1 than River-7B.", "evidence": [{"doc_id": "doc_000429", "sent_id": 6}, {"doc_id": "doc_000314", "sent_id": 4}], "id": "train_000537", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian selected Atlas-3B for evidence retrieval on 2026-06-25.", "evidence": [{"doc_id": "doc_000483", "sent_id": 3}], "id": "train_000538", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid selected Atlas-7B for evidence retrieval on 2026-04-23.", "evidence": [{"doc_id": "doc_000053", "sent_id": 4}], "id": "train_000539", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron's run with Quartz-3B failed on Node Pine-07 because of an unstable-validation-loss error on 2026-05-22.", "evidence": [{"doc_id": "doc_000450", "sent_id": 8}], "id": "train_000540", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Marble-8B for error analysis on 2026-06-11.", "evidence": [{"doc_id": "doc_000427", "sent_id": 3}], "id": "train_000541", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Node Aspen-01 allocated 4 GPUs to Project Meridian on 2026-04-18.", "evidence": [{"doc_id": "doc_000074", "sent_id": 5}], "id": "train_000542", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from calibrated voting to confidence calibration on 2026-04-26.", "evidence": [{"doc_id": "doc_000053", "sent_id": 5}], "id": "train_000543", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Priya Moss was assigned as the data steward for Project Saffron on 2026-04-22.", "evidence": [], "id": "train_000544", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Orchid-7B achieved 0.590 evidence F1 on LabQA-3 for Project Aster on 2026-06-08.", "evidence": [], "id": "train_000545", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Cedar-7B achieved 0.782 latency efficiency score on VestaLogs-2 for Project Meridian on 2026-06-01.", "evidence": [{"doc_id": "doc_000334", "sent_id": 2}], "id": "train_000546", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The review of Pale Compass Study 1 reported on 2026-06-30 that it used threshold search and did not use a reward model.", "evidence": [], "id": "train_000547", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Lena Sol was assigned as the evaluation owner for Project Saffron on 2026-05-18.", "evidence": [{"doc_id": "doc_000403", "sent_id": 2}], "id": "train_000548", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-06-02 recorded latency efficiency score for Kestrel-7B on CedarQA using Node Birch-04 on 2026-04-14.", "evidence": [{"doc_id": "doc_000256", "sent_id": 7}, {"doc_id": "doc_000335", "sent_id": 6}], "id": "train_000549", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata moved the Milestone R1 deadline to 2026-06-21 on 2026-06-03.", "evidence": [{"doc_id": "doc_000262", "sent_id": 5}], "id": "train_000550", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Iris Lane was assigned as the evaluation owner for Project Anchor on 2026-04-20.", "evidence": [{"doc_id": "doc_000053", "sent_id": 2}], "id": "train_000551", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor's run with Finch-7B failed on Node Juniper-06 because of an out-of-memory error on 2026-05-07.", "evidence": [{"doc_id": "doc_000020", "sent_id": 3}], "id": "train_000552", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from confidence calibration to chain verification on 2026-06-14.", "evidence": [{"doc_id": "doc_000427", "sent_id": 4}], "id": "train_000553", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor's run with Lumen-7B failed on Node Spruce-03 because of a checkpoint-mismatch error on 2026-04-11.", "evidence": [{"doc_id": "doc_000146", "sent_id": 6}], "id": "train_000554", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Cedar-7B achieved 0.561 latency efficiency score on VestaLogs-2 for Project Meridian on 2026-04-15.", "evidence": [], "id": "train_000555", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Juniper-06 allocated 5 GPUs to Project Aster on 2026-06-13.", "evidence": [{"doc_id": "doc_000218", "sent_id": 6}], "id": "train_000556", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Quartz-8B achieved 0.592 evidence F1 on OrionBench for Project Sonata on 2026-05-09.", "evidence": [{"doc_id": "doc_000395", "sent_id": 6}], "id": "train_000557", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor's run with Helix-3B failed on Node Poplar-12 because of a checkpoint-mismatch error on 2026-06-27.", "evidence": [], "id": "train_000558", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid changed its method from cross-encoder reranking to sentence pruning on 2026-06-03.", "evidence": [{"doc_id": "doc_000413", "sent_id": 4}], "id": "train_000559", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Orchid-7B achieved a higher latency efficiency score than Finch-8B.", "evidence": [{"doc_id": "doc_000456", "sent_id": 7}, {"doc_id": "doc_000137", "sent_id": 2}], "id": "train_000560", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Mina Shah was assigned as the lead on 2026-06-09 changed its method from contrastive tuning to threshold search on 2026-06-20.", "evidence": [{"doc_id": "doc_000351", "sent_id": 6}, {"doc_id": "doc_000197", "sent_id": 6}], "id": "train_000561", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Selene Rios was assigned as the data steward on 2026-06-08 received 4 GPUs from Node Juniper-06 on 2026-05-09.", "evidence": [{"doc_id": "doc_000225", "sent_id": 2}, {"doc_id": "doc_000173", "sent_id": 6}], "id": "train_000562", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Julian Gray was assigned as the lead on 2026-06-02 moved the Milestone B2 deadline to 2026-05-13 on 2026-04-29.", "evidence": [{"doc_id": "doc_000212", "sent_id": 3}, {"doc_id": "doc_000390", "sent_id": 5}], "id": "train_000563", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Atlas-7B for calibration on 2026-06-21.", "evidence": [{"doc_id": "doc_000125", "sent_id": 4}], "id": "train_000564", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-04-06 moved the Milestone T1 deadline from 2026-06-20 to 2026-06-30 on 2026-06-10.", "evidence": [{"doc_id": "doc_000145", "sent_id": 2}, {"doc_id": "doc_000463", "sent_id": 5}], "id": "train_000565", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Helix-8B achieved a higher evidence F1 than Marble-8B.", "evidence": [{"doc_id": "doc_000037", "sent_id": 6}, {"doc_id": "doc_000237", "sent_id": 6}], "id": "train_000566", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Nova-3B achieved 0.705 accuracy on LumenFacts-2 for Project Meridian on 2026-05-02.", "evidence": [{"doc_id": "doc_000053", "sent_id": 9}], "id": "train_000567", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from LoRA adaptation to alias expansion on 2026-05-22.", "evidence": [], "id": "train_000568", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Helix-7B for claim classification on 2026-05-01.", "evidence": [{"doc_id": "doc_000289", "sent_id": 8}], "id": "train_000569", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone V1 deadline from 2026-05-18 to 2026-05-24 on 2026-04-22.", "evidence": [{"doc_id": "doc_000073", "sent_id": 5}], "id": "train_000570", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from data mixing to contrastive tuning on 2026-04-03.", "evidence": [], "id": "train_000571", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Aster-3B achieved 0.634 macro F1 on RiverBench-3 for Project Sonata on 2026-06-07.", "evidence": [], "id": "train_000572", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor's run with Finch-7B failed on Node Juniper-06 because of a missing-index error on 2026-06-14.", "evidence": [], "id": "train_000573", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Vera Torres was assigned as the retrieval owner on 2026-06-02 selected Lumen-8B for reranking on 2026-06-18.", "evidence": [{"doc_id": "doc_000403", "sent_id": 7}, {"doc_id": "doc_000193", "sent_id": 4}], "id": "train_000574", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Orchid-8B for reranking on 2026-05-21.", "evidence": [{"doc_id": "doc_000403", "sent_id": 3}], "id": "train_000575", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected River-8B for evidence retrieval on 2026-05-22.", "evidence": [{"doc_id": "doc_000316", "sent_id": 9}], "id": "train_000576", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The review of Quiet Retriever Study 1 reported on 2026-05-26 that it used dense retrieval and did not use a reward model.", "evidence": [], "id": "train_000577", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-04-20 recorded evidence F1 for Atlas-3B on RiverBench-2 using Node Hazel-14 on 2026-06-15.", "evidence": [{"doc_id": "doc_000004", "sent_id": 2}, {"doc_id": "doc_000120", "sent_id": 2}], "id": "train_000578", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Vela-3B achieved a higher latency efficiency score than Aster-3B.", "evidence": [{"doc_id": "doc_000187", "sent_id": 2}, {"doc_id": "doc_000192", "sent_id": 7}], "id": "train_000579", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Lattice Memory Study 1 reported on 2026-06-12 that it used cross-encoder reranking and did not use a reward model.", "evidence": [{"doc_id": "doc_000354", "sent_id": 8}], "id": "train_000580", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Nimbus-3B for calibration on 2026-04-30.", "evidence": [{"doc_id": "doc_000412", "sent_id": 3}], "id": "train_000581", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid's run with Kestrel-8B failed because of an out-of-memory error on 2026-06-05 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000456", "sent_id": 8}], "id": "train_000582", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared macro F1 runs, Finch-7B achieved a higher macro F1 than Aster-7B.", "evidence": [{"doc_id": "doc_000201", "sent_id": 2}, {"doc_id": "doc_000055", "sent_id": 7}], "id": "train_000583", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Mira-3B achieved 0.684 accuracy on MemoTrace-2 for Project Saffron on 2026-04-14.", "evidence": [{"doc_id": "doc_000373", "sent_id": 7}], "id": "train_000584", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Lena Sol was assigned as the evaluation owner for Project Saffron on 2026-05-12.", "evidence": [{"doc_id": "doc_000400", "sent_id": 7}], "id": "train_000585", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian had a failed run with River-3B on Node Elm-08 because of a checkpoint-mismatch error on 2026-06-25.", "evidence": [{"doc_id": "doc_000253", "sent_id": 3}], "id": "train_000586", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Rohan Park was assigned as the lead on 2026-06-30 selected Nimbus-3B for calibration on 2026-06-07.", "evidence": [{"doc_id": "doc_000193", "sent_id": 8}, {"doc_id": "doc_000007", "sent_id": 4}], "id": "train_000587", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Nimbus-7B for calibration on 2026-04-23.", "evidence": [{"doc_id": "doc_000285", "sent_id": 3}], "id": "train_000588", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-04-27 moved the Milestone B2 deadline from 2026-05-13 to 2026-05-16 on 2026-04-27.", "evidence": [{"doc_id": "doc_000394", "sent_id": 2}, {"doc_id": "doc_000485", "sent_id": 1}], "id": "train_000589", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster moved the Milestone B1 deadline to 2026-05-19 on 2026-05-13.", "evidence": [{"doc_id": "doc_000316", "sent_id": 6}], "id": "train_000590", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Marble-3B for error analysis on 2026-05-15.", "evidence": [{"doc_id": "doc_000307", "sent_id": 8}], "id": "train_000591", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Poplar-12 allocated 4 GPUs to Project Anchor on 2026-04-04.", "evidence": [{"doc_id": "doc_000139", "sent_id": 6}], "id": "train_000592", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Spruce-03 allocated 1 GPU to Project Aster on 2026-04-25.", "evidence": [], "id": "train_000593", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Delta Evidence Study 4 reported on 2026-04-01 that it used query rewriting and did not use a reward model.", "evidence": [{"doc_id": "doc_000008", "sent_id": 5}], "id": "train_000594", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira Sato was assigned as the data steward for Project Meridian on 2026-05-26.", "evidence": [{"doc_id": "doc_000071", "sent_id": 7}], "id": "train_000595", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone D2 deadline to 2026-06-28 on 2026-06-17.", "evidence": [{"doc_id": "doc_000376", "sent_id": 5}], "id": "train_000596", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Felix Brooks was assigned as the retrieval owner on 2026-06-01 changed its method from alias expansion to LoRA adaptation on 2026-05-07.", "evidence": [{"doc_id": "doc_000435", "sent_id": 2}, {"doc_id": "doc_000087", "sent_id": 2}], "id": "train_000597", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from rank fusion to teacher distillation on 2026-05-04.", "evidence": [], "id": "train_000598", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid's run with Nimbus-3B failed because of an out-of-memory error on 2026-05-08 while using Node Pine-07.", "evidence": [{"doc_id": "doc_000387", "sent_id": 8}], "id": "train_000599", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron moved the Milestone D2 deadline from 2026-06-02 to 2026-06-06 on 2026-05-13.", "evidence": [{"doc_id": "doc_000284", "sent_id": 5}], "id": "train_000600", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster moved the Milestone B1 deadline to 2026-05-05 on 2026-04-29.", "evidence": [{"doc_id": "doc_000053", "sent_id": 7}], "id": "train_000601", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Ravi Nadir was assigned as the data steward on 2026-04-07 had a run with Nova-8B that failed because of a missing-index error on 2026-04-03 while using Node Aspen-01.", "evidence": [{"doc_id": "doc_000300", "sent_id": 7}, {"doc_id": "doc_000196", "sent_id": 8}], "id": "train_000602", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Petra Gray was assigned as the evaluation owner for Project Saffron on 2026-04-27.", "evidence": [{"doc_id": "doc_000165", "sent_id": 2}], "id": "train_000603", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Nora Bauer was assigned as the lead on 2026-05-25 recorded macro F1 for Kestrel-3B on OrionBench-3 using Node Birch-04 on 2026-04-26.", "evidence": [{"doc_id": "doc_000388", "sent_id": 2}, {"doc_id": "doc_000332", "sent_id": 3}], "id": "train_000604", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian had a failed run with Nimbus-3B on Node Hazel-14 because of a checkpoint-mismatch error on 2026-05-08.", "evidence": [{"doc_id": "doc_000416", "sent_id": 8}], "id": "train_000605", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Sonata moved the Milestone D2 deadline to 2026-06-14 on 2026-06-03.", "evidence": [{"doc_id": "doc_000098", "sent_id": 5}], "id": "train_000606", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian moved the Milestone P1 deadline from 2026-05-21 to 2026-05-25 on 2026-05-13.", "evidence": [{"doc_id": "doc_000448", "sent_id": 5}], "id": "train_000607", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Camila Quinn was assigned as the evaluation owner on 2026-05-26 changed its method from chain verification to structured prompting on 2026-05-28.", "evidence": [{"doc_id": "doc_000101", "sent_id": 7}, {"doc_id": "doc_000409", "sent_id": 3}], "id": "train_000608", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Quartz-7B achieved a higher latency efficiency score than Nova-8B.", "evidence": [{"doc_id": "doc_000135", "sent_id": 2}, {"doc_id": "doc_000278", "sent_id": 7}], "id": "train_000609", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Kestrel-7B achieved 0.717 accuracy on CedarQA for Project Meridian on 2026-05-23.", "evidence": [{"doc_id": "doc_000138", "sent_id": 6}], "id": "train_000610", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vela-7B achieved 0.711 evidence F1 on LabQA-2 for Project Sonata on 2026-04-06.", "evidence": [{"doc_id": "doc_000278", "sent_id": 2}], "id": "train_000611", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from metric smoothing to document chunking on 2026-05-06.", "evidence": [{"doc_id": "doc_000215", "sent_id": 5}], "id": "train_000612", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Laurel-11 allocated 6 GPUs to Project Meridian on 2026-05-30.", "evidence": [{"doc_id": "doc_000054", "sent_id": 6}], "id": "train_000613", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata selected Vela-8B for reranking on 2026-04-17.", "evidence": [{"doc_id": "doc_000246", "sent_id": 8}], "id": "train_000614", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Lumen-7B achieved a higher accuracy than Finch-7B.", "evidence": [{"doc_id": "doc_000243", "sent_id": 4}, {"doc_id": "doc_000410", "sent_id": 6}], "id": "train_000615", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Mira-3B achieved 0.628 evidence F1 on TraceEval-3 for Project Sonata on 2026-06-16.", "evidence": [], "id": "train_000616", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Nora Bauer was assigned as the lead on 2026-05-25 received 5 GPUs from Node Hazel-14 on 2026-05-18.", "evidence": [{"doc_id": "doc_000388", "sent_id": 2}, {"doc_id": "doc_000333", "sent_id": 2}], "id": "train_000617", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Juniper-06 allocated 5 GPUs to Project Aster on 2026-06-27.", "evidence": [{"doc_id": "doc_000108", "sent_id": 6}], "id": "train_000618", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected River-7B for evidence retrieval on 2026-05-04.", "evidence": [], "id": "train_000619", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Elian Ames was assigned as the evaluation owner for Project Meridian on 2026-06-03.", "evidence": [], "id": "train_000620", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-04-20 moved the Milestone L1 deadline to 2026-05-13 on 2026-05-07.", "evidence": [{"doc_id": "doc_000260", "sent_id": 2}, {"doc_id": "doc_000458", "sent_id": 3}], "id": "train_000621", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Cedar-02 allocated 4 GPUs to Project Anchor on 2026-06-07.", "evidence": [], "id": "train_000622", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from QLoRA adaptation to structured prompting on 2026-04-22.", "evidence": [{"doc_id": "doc_000269", "sent_id": 5}], "id": "train_000623", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor's run with Lumen-8B failed on Node Spruce-03 because of an out-of-memory error on 2026-05-22.", "evidence": [{"doc_id": "doc_000041", "sent_id": 8}], "id": "train_000624", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from chain verification to confidence calibration on 2026-06-10.", "evidence": [{"doc_id": "doc_000430", "sent_id": 5}], "id": "train_000625", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor changed its method from LoRA adaptation to alias expansion on 2026-04-20.", "evidence": [], "id": "train_000626", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Atlas-7B for evidence retrieval on 2026-04-19.", "evidence": [{"doc_id": "doc_000398", "sent_id": 4}], "id": "train_000627", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Rowan-09 allocated 2 GPUs to Project Anchor on 2026-04-12.", "evidence": [], "id": "train_000628", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from contrastive tuning to threshold search on 2026-05-18.", "evidence": [], "id": "train_000629", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor's run with River-8B failed on Node Rowan-09 because of an out-of-memory error on 2026-04-10.", "evidence": [{"doc_id": "doc_000295", "sent_id": 8}], "id": "train_000630", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-04-21 received 4 GPUs from Node Sycamore-13 on 2026-04-04.", "evidence": [{"doc_id": "doc_000271", "sent_id": 7}, {"doc_id": "doc_000131", "sent_id": 6}], "id": "train_000631", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Finch-7B achieved 0.877 macro F1 on NereidNotes-2 for Project Aster on 2026-06-08.", "evidence": [{"doc_id": "doc_000201", "sent_id": 2}], "id": "train_000632", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Orchid-3B achieved 0.901 macro F1 on LabQA-3 for Project Aster on 2026-04-20.", "evidence": [{"doc_id": "doc_000423", "sent_id": 2}], "id": "train_000633", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Elm-08 allocated 3 GPUs to Project Nereid on 2026-05-27.", "evidence": [], "id": "train_000634", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Helix-3B for claim classification on 2026-04-23.", "evidence": [{"doc_id": "doc_000346", "sent_id": 3}], "id": "train_000635", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Talia Reed was assigned as the retrieval owner on 2026-05-26 changed its method from cross-encoder reranking to late interaction on 2026-04-19.", "evidence": [{"doc_id": "doc_000319", "sent_id": 7}, {"doc_id": "doc_000073", "sent_id": 4}], "id": "train_000636", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata changed its method from alias expansion to dense retrieval on 2026-05-24.", "evidence": [{"doc_id": "doc_000290", "sent_id": 4}], "id": "train_000637", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Nimbus-3B for calibration on 2026-04-09.", "evidence": [{"doc_id": "doc_000019", "sent_id": 3}], "id": "train_000638", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid changed its method from contrastive tuning to threshold search on 2026-04-26.", "evidence": [{"doc_id": "doc_000346", "sent_id": 4}], "id": "train_000639", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Helix-8B achieved 0.827 accuracy on CedarQA-3 for Project Anchor on 2026-04-18.", "evidence": [{"doc_id": "doc_000092", "sent_id": 6}], "id": "train_000640", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Rohan Singh was assigned as the retrieval owner on 2026-04-28 recorded evidence F1 for Aster-7B on RiverBench-3 using Node Aspen-01 on 2026-05-12.", "evidence": [{"doc_id": "doc_000159", "sent_id": 6}, {"doc_id": "doc_000230", "sent_id": 7}], "id": "train_000641", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Finch-13B for evidence retrieval on 2026-05-10.", "evidence": [{"doc_id": "doc_000450", "sent_id": 4}], "id": "train_000642", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Leo Park was assigned as the evaluation owner on 2026-06-23 changed its method from evidence pooling to BM25 retrieval on 2026-05-07.", "evidence": [{"doc_id": "doc_000062", "sent_id": 7}, {"doc_id": "doc_000291", "sent_id": 3}], "id": "train_000643", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid's run with Kestrel-3B failed because of an out-of-memory error on 2026-06-25 while using Node Willow-05.", "evidence": [{"doc_id": "doc_000393", "sent_id": 3}], "id": "train_000644", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Jonas Nolan was assigned as the retrieval owner for Project Sonata on 2026-06-09.", "evidence": [{"doc_id": "doc_000408", "sent_id": 9}], "id": "train_000645", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata had a failed run with Finch-8B on Node Willow-05 because of an out-of-memory error on 2026-04-23.", "evidence": [{"doc_id": "doc_000416", "sent_id": 3}], "id": "train_000646", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron moved the Milestone F1 deadline from 2026-06-25 to 2026-07-03 on 2026-06-17.", "evidence": [{"doc_id": "doc_000031", "sent_id": 5}], "id": "train_000647", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Kestrel-8B achieved a higher accuracy than Mira-8B.", "evidence": [{"doc_id": "doc_000408", "sent_id": 8}, {"doc_id": "doc_000177", "sent_id": 6}], "id": "train_000648", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata selected Quartz-8B for claim classification on 2026-04-30.", "evidence": [{"doc_id": "doc_000372", "sent_id": 3}], "id": "train_000649", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from reward reranking to data mixing on 2026-04-13.", "evidence": [], "id": "train_000650", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian moved the Milestone D1 deadline from 2026-05-09 to 2026-05-13 on 2026-04-26.", "evidence": [], "id": "train_000651", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "River-8B achieved 0.640 latency efficiency score on SignalSet-2 for Project Anchor on 2026-04-28.", "evidence": [{"doc_id": "doc_000269", "sent_id": 7}], "id": "train_000652", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata changed its method from threshold search to contrastive tuning on 2026-05-13.", "evidence": [{"doc_id": "doc_000134", "sent_id": 5}], "id": "train_000653", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Nova-3B for reranking on 2026-05-08.", "evidence": [{"doc_id": "doc_000390", "sent_id": 8}], "id": "train_000654", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian's run with Cedar-3B failed because of a missing-index error on 2026-06-06 while using Node Laurel-11.", "evidence": [], "id": "train_000655", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nimbus-7B achieved 0.828 evidence F1 on TraceEval-2 for Project Nereid on 2026-05-20.", "evidence": [], "id": "train_000656", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Samir Ames was assigned as the lead on 2026-05-18 received 3 GPUs from Node Fir-10 on 2026-05-06.", "evidence": [{"doc_id": "doc_000256", "sent_id": 2}, {"doc_id": "doc_000060", "sent_id": 4}], "id": "train_000657", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Darian Hale was assigned as the retrieval owner for Project Nereid on 2026-04-15.", "evidence": [], "id": "train_000658", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Atlas-7B achieved 0.849 accuracy on SignalSet-3 for Project Meridian on 2026-06-29.", "evidence": [], "id": "train_000659", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-04-27 received 4 GPUs from Node Sycamore-13 on 2026-05-08.", "evidence": [{"doc_id": "doc_000165", "sent_id": 2}, {"doc_id": "doc_000367", "sent_id": 8}], "id": "train_000660", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Quiet Retriever Study 1 reported on 2026-04-01 that it used dense retrieval and did not use a reward model.", "evidence": [{"doc_id": "doc_000445", "sent_id": 5}], "id": "train_000661", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Iris Lane was assigned as the evaluation owner for Project Anchor on 2026-04-14.", "evidence": [{"doc_id": "doc_000374", "sent_id": 7}], "id": "train_000662", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Elian Ames was assigned as the evaluation owner for Project Meridian on 2026-06-17.", "evidence": [], "id": "train_000663", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian selected Marble-8B for calibration on 2026-06-28.", "evidence": [{"doc_id": "doc_000139", "sent_id": 4}], "id": "train_000664", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Dr. Jonas Rios was assigned as the lead on 2026-05-25 selected Mira-7B for calibration on 2026-04-02.", "evidence": [{"doc_id": "doc_000384", "sent_id": 2}, {"doc_id": "doc_000292", "sent_id": 3}], "id": "train_000665", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Aster-8B achieved a higher latency efficiency score than Vela-8B.", "evidence": [{"doc_id": "doc_000493", "sent_id": 7}, {"doc_id": "doc_000450", "sent_id": 2}], "id": "train_000666", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor's run with Helix-3B failed on Node Poplar-12 because of an out-of-memory error on 2026-06-26.", "evidence": [{"doc_id": "doc_000312", "sent_id": 8}], "id": "train_000667", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Willow-05 allocated 6 GPUs to Project Saffron on 2026-06-13.", "evidence": [{"doc_id": "doc_000334", "sent_id": 6}], "id": "train_000668", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Helix-7B achieved 0.512 macro F1 on OrionBench-2 for Project Aster on 2026-06-21.", "evidence": [], "id": "train_000669", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian changed its method from data mixing to contrastive tuning on 2026-06-21.", "evidence": [{"doc_id": "doc_000288", "sent_id": 5}], "id": "train_000670", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-05-31 recorded accuracy for Helix-3B on CedarQA-3 using Node Poplar-12 on 2026-05-02.", "evidence": [{"doc_id": "doc_000236", "sent_id": 4}, {"doc_id": "doc_000238", "sent_id": 7}], "id": "train_000671", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Orchid-3B for reranking on 2026-05-29.", "evidence": [{"doc_id": "doc_000326", "sent_id": 10}], "id": "train_000672", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid moved the Milestone P1 deadline to 2026-04-30 on 2026-04-09.", "evidence": [], "id": "train_000673", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from hybrid retrieval to QLoRA adaptation on 2026-06-10.", "evidence": [{"doc_id": "doc_000072", "sent_id": 5}], "id": "train_000674", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Sycamore-13 allocated 2 GPUs to Project Saffron on 2026-05-03.", "evidence": [], "id": "train_000675", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Vela-7B for reranking on 2026-06-19.", "evidence": [{"doc_id": "doc_000174", "sent_id": 8}], "id": "train_000676", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "River-3B achieved 0.539 accuracy on SignalSet-2 for Project Anchor on 2026-04-22.", "evidence": [], "id": "train_000677", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Helix-7B achieved 0.857 latency efficiency score on CedarQA-3 for Project Anchor on 2026-04-25.", "evidence": [{"doc_id": "doc_000480", "sent_id": 6}], "id": "train_000678", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-05-26 selected Vela-7B for claim classification on 2026-06-24.", "evidence": [{"doc_id": "doc_000138", "sent_id": 7}, {"doc_id": "doc_000293", "sent_id": 5}], "id": "train_000679", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Cedar-7B achieved 0.741 latency efficiency score on VestaLogs-2 for Project Meridian on 2026-05-27.", "evidence": [], "id": "train_000680", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vela-3B achieved 0.565 latency efficiency score on LumenFacts-3 for Project Saffron on 2026-05-30.", "evidence": [{"doc_id": "doc_000043", "sent_id": 6}], "id": "train_000681", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Jonas Rios was assigned as the lead for Project Sonata on 2026-05-19.", "evidence": [{"doc_id": "doc_000117", "sent_id": 7}], "id": "train_000682", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor changed its method from evidence pooling to calibrated voting on 2026-04-12.", "evidence": [{"doc_id": "doc_000437", "sent_id": 4}], "id": "train_000683", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster moved the Milestone B1 deadline to 2026-05-19 on 2026-05-14.", "evidence": [], "id": "train_000684", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Rowan-09 allocated 3 GPUs to Project Aster on 2026-05-10.", "evidence": [], "id": "train_000685", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Theo Lind was assigned as the evaluation owner on 2026-05-19 moved the Milestone F1 deadline to 2026-05-02 on 2026-04-24.", "evidence": [{"doc_id": "doc_000023", "sent_id": 7}, {"doc_id": "doc_000017", "sent_id": 8}], "id": "train_000686", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-04-06 had a run with Finch-7B on Node Juniper-06 that failed because of a checkpoint-mismatch error on 2026-05-07.", "evidence": [{"doc_id": "doc_000145", "sent_id": 2}, {"doc_id": "doc_000020", "sent_id": 3}], "id": "train_000687", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Rowan-09 allocated 4 GPUs to Project Anchor on 2026-04-11.", "evidence": [{"doc_id": "doc_000097", "sent_id": 6}], "id": "train_000688", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Owen Torres was assigned as the evaluation owner on 2026-04-13 moved the Milestone F1 deadline from 2026-06-28 to 2026-07-08 on 2026-06-10.", "evidence": [{"doc_id": "doc_000039", "sent_id": 2}, {"doc_id": "doc_000093", "sent_id": 5}], "id": "train_000689", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron had a failed run with Marble-7B on Node Fir-10 because of a missing-index error on 2026-05-14.", "evidence": [{"doc_id": "doc_000232", "sent_id": 3}], "id": "train_000690", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata selected Marble-8B for claim classification on 2026-04-05.", "evidence": [{"doc_id": "doc_000373", "sent_id": 4}], "id": "train_000691", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid selected Atlas-8B for evidence retrieval on 2026-05-01.", "evidence": [{"doc_id": "doc_000073", "sent_id": 8}], "id": "train_000692", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Nimbus-7B for calibration on 2026-04-17.", "evidence": [{"doc_id": "doc_000292", "sent_id": 8}], "id": "train_000693", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian changed its method from cross-encoder reranking to late interaction on 2026-04-03.", "evidence": [], "id": "train_000694", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Nested Verifier Study 4 reported on 2026-05-27 that it used cross-encoder reranking and used a reward model.", "evidence": [{"doc_id": "doc_000154", "sent_id": 5}], "id": "train_000695", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Mira-3B for calibration on 2026-05-16.", "evidence": [], "id": "train_000696", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron moved the Milestone R1 deadline from 2026-04-15 to 2026-04-21 on 2026-04-02.", "evidence": [], "id": "train_000697", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Camila Quinn was assigned as the evaluation owner on 2026-04-27 received 4 GPUs from Node Hazel-14 on 2026-04-18.", "evidence": [{"doc_id": "doc_000151", "sent_id": 2}, {"doc_id": "doc_000145", "sent_id": 6}], "id": "train_000698", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Lumen-7B achieved 0.803 accuracy on MemoTrace-3 for Project Anchor on 2026-05-12.", "evidence": [], "id": "train_000699", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron moved the Milestone L1 deadline from 2026-07-04 to 2026-07-12 on 2026-06-17.", "evidence": [], "id": "train_000700", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-05-24 recorded accuracy for Cedar-7B on VestaLogs-2 using Node Laurel-11 on 2026-04-04.", "evidence": [{"doc_id": "doc_000226", "sent_id": 4}, {"doc_id": "doc_000355", "sent_id": 8}], "id": "train_000701", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Nimbus-8B achieved 0.753 accuracy on MemoTrace for Project Meridian on 2026-04-25.", "evidence": [{"doc_id": "doc_000289", "sent_id": 6}], "id": "train_000702", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from sentence pruning to cross-encoder reranking on 2026-04-19.", "evidence": [{"doc_id": "doc_000064", "sent_id": 4}], "id": "train_000703", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mina Torres was assigned as the data steward for Project Saffron on 2026-06-29.", "evidence": [{"doc_id": "doc_000265", "sent_id": 2}], "id": "train_000704", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Kestrel-8B achieved 0.702 evidence F1 on OrionBench-3 for Project Nereid on 2026-06-09.", "evidence": [], "id": "train_000705", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Nora Sol was assigned as the retrieval owner for Project Nereid on 2026-05-26.", "evidence": [{"doc_id": "doc_000241", "sent_id": 7}], "id": "train_000706", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata moved the Milestone L1 deadline to 2026-04-14 on 2026-04-08.", "evidence": [{"doc_id": "doc_000070", "sent_id": 5}], "id": "train_000707", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid moved the Milestone V1 deadline to 2026-06-23 on 2026-06-18.", "evidence": [], "id": "train_000708", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata's run with Quartz-7B failed because of an out-of-memory error on 2026-04-16 while using Node Cedar-02.", "evidence": [{"doc_id": "doc_000398", "sent_id": 3}], "id": "train_000709", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Sonata had a failed run with Aster-7B on Node Maple-01 because of an out-of-memory error on 2026-05-22.", "evidence": [{"doc_id": "doc_000020", "sent_id": 8}], "id": "train_000710", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Silver Notes Study 1 reported on 2026-05-15 that it used reward reranking and used a reward model.", "evidence": [{"doc_id": "doc_000075", "sent_id": 8}], "id": "train_000711", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Anika Costa was assigned as the retrieval owner for Project Aster on 2026-06-03.", "evidence": [], "id": "train_000712", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Lena Costa was assigned as the data steward for Project Saffron on 2026-05-19.", "evidence": [{"doc_id": "doc_000426", "sent_id": 7}], "id": "train_000713", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Atlas-7B for reranking on 2026-04-24.", "evidence": [{"doc_id": "doc_000190", "sent_id": 8}], "id": "train_000714", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Leo Park was assigned as the evaluation owner for Project Anchor on 2026-05-13.", "evidence": [], "id": "train_000715", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Stable Chains Study 4 reported on 2026-04-13 that it used alias expansion and did not use a reward model.", "evidence": [{"doc_id": "doc_000371", "sent_id": 2}], "id": "train_000716", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone D1 deadline from 2026-05-18 to 2026-05-22 on 2026-05-06.", "evidence": [{"doc_id": "doc_000395", "sent_id": 5}], "id": "train_000717", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nimbus-8B achieved 0.839 macro F1 on TraceEval-2 for Project Nereid on 2026-05-18.", "evidence": [{"doc_id": "doc_000391", "sent_id": 2}], "id": "train_000718", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Kestrel-7B achieved 0.743 macro F1 on OrionBench-3 for Project Nereid on 2026-06-27.", "evidence": [{"doc_id": "doc_000248", "sent_id": 6}], "id": "train_000719", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Mira-8B achieved 0.639 evidence F1 on TraceEval-3 for Project Sonata on 2026-06-01.", "evidence": [{"doc_id": "doc_000218", "sent_id": 2}], "id": "train_000720", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Lumen-8B achieved a higher evidence F1 than River-8B.", "evidence": [{"doc_id": "doc_000040", "sent_id": 7}, {"doc_id": "doc_000192", "sent_id": 2}], "id": "train_000721", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata had a failed run with Lumen-3B on Node Pine-07 because of an out-of-memory error on 2026-04-10.", "evidence": [{"doc_id": "doc_000281", "sent_id": 7}], "id": "train_000722", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Dr. Arun Bauer was assigned as the evaluation owner for Project Anchor on 2026-05-11.", "evidence": [{"doc_id": "doc_000138", "sent_id": 2}], "id": "train_000723", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vera Kim was assigned as the lead for Project Nereid on 2026-05-27.", "evidence": [], "id": "train_000724", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata selected Quartz-3B for error analysis on 2026-05-10.", "evidence": [{"doc_id": "doc_000223", "sent_id": 4}], "id": "train_000725", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Felix Brooks was assigned as the retrieval owner on 2026-04-13 received 3 GPUs from Node Spruce-03 on 2026-05-01.", "evidence": [{"doc_id": "doc_000402", "sent_id": 2}, {"doc_id": "doc_000010", "sent_id": 7}], "id": "train_000726", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Kestrel-8B achieved 0.882 evidence F1 on OrionBench-3 for Project Nereid on 2026-05-04.", "evidence": [], "id": "train_000727", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor changed its method from query rewriting to sentence pruning on 2026-04-13.", "evidence": [], "id": "train_000728", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Mira Sato was assigned as the data steward on 2026-05-12 changed its method from metric smoothing to document chunking on 2026-05-06.", "evidence": [{"doc_id": "doc_000015", "sent_id": 7}, {"doc_id": "doc_000215", "sent_id": 5}], "id": "train_000729", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Ravi Reed was assigned as the evaluation owner on 2026-06-09 received 2 GPUs from Node Hazel-14 on 2026-06-04.", "evidence": [{"doc_id": "doc_000384", "sent_id": 9}, {"doc_id": "doc_000453", "sent_id": 3}], "id": "train_000730", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Vela-8B achieved 0.711 macro F1 on LabQA-2 for Project Sonata on 2026-06-23.", "evidence": [{"doc_id": "doc_000312", "sent_id": 7}], "id": "train_000731", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Darian Grant was assigned as the lead for Project Nereid on 2026-04-07.", "evidence": [{"doc_id": "doc_000119", "sent_id": 9}], "id": "train_000732", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-06-23 changed its method from alias expansion to LoRA adaptation on 2026-06-10.", "evidence": [{"doc_id": "doc_000204", "sent_id": 7}, {"doc_id": "doc_000076", "sent_id": 5}], "id": "train_000733", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-06-01 changed its method from document chunking to metric smoothing on 2026-04-25.", "evidence": [{"doc_id": "doc_000378", "sent_id": 2}, {"doc_id": "doc_000016", "sent_id": 6}], "id": "train_000734", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from contrastive tuning to threshold search on 2026-05-17.", "evidence": [{"doc_id": "doc_000214", "sent_id": 4}], "id": "train_000735", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Nova-8B achieved 0.825 latency efficiency score on LumenFacts-2 for Project Meridian on 2026-04-22.", "evidence": [], "id": "train_000736", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Talia Marin was assigned as the lead for Project Aster on 2026-06-02.", "evidence": [{"doc_id": "doc_000043", "sent_id": 7}], "id": "train_000737", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Spruce-03 allocated 1 GPU to Project Aster on 2026-06-14.", "evidence": [], "id": "train_000738", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Pale Compass Study 4 reported on 2026-06-01 that it used data mixing and did not use a reward model.", "evidence": [{"doc_id": "doc_000359", "sent_id": 2}], "id": "train_000739", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from BM25 retrieval to evidence pooling on 2026-04-26.", "evidence": [{"doc_id": "doc_000390", "sent_id": 4}], "id": "train_000740", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian moved the Milestone J1 deadline from 2026-07-14 to 2026-07-18 on 2026-06-24.", "evidence": [{"doc_id": "doc_000419", "sent_id": 6}], "id": "train_000741", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared latency efficiency score runs, Nimbus-8B achieved a higher latency efficiency score than Marble-8B.", "evidence": [{"doc_id": "doc_000332", "sent_id": 6}, {"doc_id": "doc_000141", "sent_id": 3}], "id": "train_000742", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared accuracy runs, Helix-8B achieved a higher accuracy than Nimbus-7B.", "evidence": [{"doc_id": "doc_000092", "sent_id": 6}, {"doc_id": "doc_000414", "sent_id": 8}], "id": "train_000743", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster's run with River-8B failed on Node Rowan-09 because of an unstable-validation-loss error on 2026-05-16.", "evidence": [], "id": "train_000744", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, River-3B achieved a higher latency efficiency score than Quartz-3B.", "evidence": [{"doc_id": "doc_000055", "sent_id": 2}, {"doc_id": "doc_000223", "sent_id": 7}], "id": "train_000745", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata had a failed run with Vela-3B on Node Fir-10 because of an out-of-memory error on 2026-06-25.", "evidence": [{"doc_id": "doc_000228", "sent_id": 3}], "id": "train_000746", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-05-18 changed its method from calibrated voting to confidence calibration on 2026-06-07.", "evidence": [{"doc_id": "doc_000417", "sent_id": 2}, {"doc_id": "doc_000169", "sent_id": 4}], "id": "train_000747", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Dr. Arun Bauer was assigned as the evaluation owner for Project Anchor on 2026-05-25.", "evidence": [{"doc_id": "doc_000408", "sent_id": 2}], "id": "train_000748", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-05-18 had a run with Vela-3B that failed because of an out-of-memory error on 2026-05-15 while using Node Fir-10.", "evidence": [{"doc_id": "doc_000212", "sent_id": 2}, {"doc_id": "doc_000173", "sent_id": 8}], "id": "train_000749", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Elian Ames was assigned as the evaluation owner for Project Meridian on 2026-05-18.", "evidence": [{"doc_id": "doc_000177", "sent_id": 2}], "id": "train_000750", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Samir Ames was assigned as the lead on 2026-06-16 moved the Milestone L1 deadline to 2026-07-04 on 2026-06-23.", "evidence": [{"doc_id": "doc_000378", "sent_id": 7}, {"doc_id": "doc_000155", "sent_id": 7}], "id": "train_000751", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Nested Verifier Study 3 reported on 2026-06-26 that it used hard-negative mining and used a reward model.", "evidence": [{"doc_id": "doc_000152", "sent_id": 8}], "id": "train_000752", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone L1 deadline to 2026-04-27 on 2026-04-01.", "evidence": [{"doc_id": "doc_000119", "sent_id": 6}], "id": "train_000753", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-06-15 moved the Milestone X1 deadline from 2026-05-17 to 2026-05-08 on 2026-04-21.", "evidence": [{"doc_id": "doc_000465", "sent_id": 2}, {"doc_id": "doc_000186", "sent_id": 6}], "id": "train_000754", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Kestrel-3B for claim classification on 2026-05-21.", "evidence": [{"doc_id": "doc_000476", "sent_id": 4}], "id": "train_000755", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster selected Lumen-7B for calibration on 2026-04-24.", "evidence": [{"doc_id": "doc_000019", "sent_id": 8}], "id": "train_000756", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vera Torres was assigned as the retrieval owner for Project Nereid on 2026-05-25.", "evidence": [{"doc_id": "doc_000267", "sent_id": 2}], "id": "train_000757", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata changed its method from dense retrieval to rank fusion on 2026-04-12.", "evidence": [{"doc_id": "doc_000019", "sent_id": 4}], "id": "train_000758", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Vela-7B achieved 0.666 accuracy on LumenFacts-3 for Project Saffron on 2026-05-11.", "evidence": [{"doc_id": "doc_000213", "sent_id": 2}], "id": "train_000759", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared accuracy runs, Lumen-8B achieved a higher accuracy than Atlas-8B.", "evidence": [{"doc_id": "doc_000098", "sent_id": 6}, {"doc_id": "doc_000417", "sent_id": 6}], "id": "train_000760", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata changed its method from QLoRA adaptation to hybrid retrieval on 2026-06-14.", "evidence": [{"doc_id": "doc_000259", "sent_id": 4}], "id": "train_000761", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Quartz-7B achieved 0.708 latency efficiency score on CedarQA-2 for Project Saffron on 2026-06-08.", "evidence": [{"doc_id": "doc_000135", "sent_id": 2}], "id": "train_000762", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Finch-7B achieved 0.881 latency efficiency score on VestaLogs for Project Anchor on 2026-06-06.", "evidence": [{"doc_id": "doc_000262", "sent_id": 6}], "id": "train_000763", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Orchid-7B achieved 0.946 latency efficiency score on LumenFacts for Project Anchor on 2026-06-02.", "evidence": [{"doc_id": "doc_000456", "sent_id": 7}], "id": "train_000764", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Lumen-8B for calibration on 2026-06-13.", "evidence": [], "id": "train_000765", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Anika Sato was assigned as the lead for Project Aster on 2026-05-27.", "evidence": [], "id": "train_000766", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Evan Iyer was assigned as the retrieval owner for Project Nereid on 2026-06-09.", "evidence": [{"doc_id": "doc_000098", "sent_id": 7}], "id": "train_000767", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Owen Torres was assigned as the evaluation owner on 2026-04-13 changed its method from BM25 retrieval to evidence pooling on 2026-06-20.", "evidence": [{"doc_id": "doc_000039", "sent_id": 2}, {"doc_id": "doc_000239", "sent_id": 6}], "id": "train_000768", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Nora Bauer was assigned as the lead for Project Nereid on 2026-05-20.", "evidence": [], "id": "train_000769", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Camila Brooks was assigned as the data steward for Project Meridian on 2026-04-14.", "evidence": [{"doc_id": "doc_000246", "sent_id": 7}], "id": "train_000770", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Kestrel-3B for claim classification on 2026-06-08.", "evidence": [], "id": "train_000771", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared macro F1 runs, Nimbus-8B achieved a higher macro F1 than Vela-8B.", "evidence": [{"doc_id": "doc_000173", "sent_id": 7}, {"doc_id": "doc_000122", "sent_id": 2}], "id": "train_000772", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid selected Nimbus-7B for calibration on 2026-05-22.", "evidence": [{"doc_id": "doc_000448", "sent_id": 8}], "id": "train_000773", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid changed its method from BM25 retrieval to evidence pooling on 2026-04-05.", "evidence": [{"doc_id": "doc_000246", "sent_id": 4}], "id": "train_000774", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Marble-8B achieved a higher latency efficiency score than Aster-8B.", "evidence": [{"doc_id": "doc_000141", "sent_id": 3}, {"doc_id": "doc_000222", "sent_id": 7}], "id": "train_000775", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-06-01 had a run with Cedar-7B that failed because of a checkpoint-mismatch error on 2026-05-29 while using Node Laurel-11.", "evidence": [{"doc_id": "doc_000338", "sent_id": 2}, {"doc_id": "doc_000321", "sent_id": 8}], "id": "train_000776", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Helix-7B achieved a higher evidence F1 than Vela-7B.", "evidence": [{"doc_id": "doc_000067", "sent_id": 2}, {"doc_id": "doc_000488", "sent_id": 6}], "id": "train_000777", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster moved the Milestone N1 deadline to 2026-07-11 on 2026-06-20.", "evidence": [], "id": "train_000778", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Nimbus-8B for calibration on 2026-05-01.", "evidence": [{"doc_id": "doc_000064", "sent_id": 8}], "id": "train_000779", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Mira-3B achieved 0.658 macro F1 on TraceEval-3 for Project Sonata on 2026-04-18.", "evidence": [{"doc_id": "doc_000061", "sent_id": 6}], "id": "train_000780", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Sofia Nadir was assigned as the retrieval owner for Project Sonata on 2026-06-22.", "evidence": [{"doc_id": "doc_000142", "sent_id": 2}], "id": "train_000781", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Lumen-3B achieved 0.589 macro F1 on TraceEval for Project Aster on 2026-04-07.", "evidence": [{"doc_id": "doc_000228", "sent_id": 7}], "id": "train_000782", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Petra Adler was assigned as the data steward for Project Saffron on 2026-06-17.", "evidence": [], "id": "train_000783", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Samir Kwan was assigned as the retrieval owner for Project Sonata on 2026-05-05.", "evidence": [{"doc_id": "doc_000053", "sent_id": 10}], "id": "train_000784", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Orchid-3B achieved 0.620 macro F1 on LabQA-3 for Project Aster on 2026-06-15.", "evidence": [], "id": "train_000785", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from LoRA adaptation to hard-negative mining on 2026-05-27.", "evidence": [{"doc_id": "doc_000456", "sent_id": 5}], "id": "train_000786", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Orchid-7B achieved 0.875 accuracy on LumenFacts for Project Anchor on 2026-04-11.", "evidence": [{"doc_id": "doc_000070", "sent_id": 6}], "id": "train_000787", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Finch-3B achieved 0.566 macro F1 on NereidNotes-2 for Project Aster on 2026-06-29.", "evidence": [], "id": "train_000788", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster changed its method from teacher distillation to query rewriting on 2026-04-07.", "evidence": [], "id": "train_000789", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Julian Gray was assigned as the lead for Project Nereid on 2026-04-20.", "evidence": [{"doc_id": "doc_000298", "sent_id": 2}], "id": "train_000790", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Orchid-3B achieved 0.586 accuracy on LumenFacts for Project Anchor on 2026-06-09.", "evidence": [{"doc_id": "doc_000474", "sent_id": 7}], "id": "train_000791", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone D2 deadline to 2026-07-14 on 2026-06-10.", "evidence": [{"doc_id": "doc_000481", "sent_id": 5}], "id": "train_000792", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Atlas-7B for evidence retrieval on 2026-05-24.", "evidence": [{"doc_id": "doc_000187", "sent_id": 4}], "id": "train_000793", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Quiet Retriever Study 3 reported on 2026-04-04 that it used metric smoothing and did not use a reward model.", "evidence": [], "id": "train_000794", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Vela-7B for reranking on 2026-06-15.", "evidence": [], "id": "train_000795", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Dr. Jonas Nolan was assigned as the retrieval owner for Project Sonata on 2026-06-10.", "evidence": [], "id": "train_000796", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Frost was assigned as the data steward on 2026-04-14 recorded accuracy for River-3B on SignalSet-2 using Node Sycamore-13 on 2026-05-09.", "evidence": [{"doc_id": "doc_000233", "sent_id": 7}, {"doc_id": "doc_000165", "sent_id": 6}], "id": "train_000797", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, Lumen-8B achieved a higher accuracy than Orchid-8B.", "evidence": [{"doc_id": "doc_000273", "sent_id": 7}, {"doc_id": "doc_000411", "sent_id": 3}], "id": "train_000798", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Elian Ames was assigned as the evaluation owner on 2026-05-04 received 2 GPUs from Node Aspen-01 on 2026-05-01.", "evidence": [{"doc_id": "doc_000018", "sent_id": 2}, {"doc_id": "doc_000364", "sent_id": 8}], "id": "train_000799", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-06-16 changed its method from calibrated voting to evidence pooling on 2026-06-11.", "evidence": [{"doc_id": "doc_000435", "sent_id": 7}, {"doc_id": "doc_000494", "sent_id": 3}], "id": "train_000800", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian moved the Milestone J1 deadline from 2026-04-11 to 2026-04-21 on 2026-04-01.", "evidence": [{"doc_id": "doc_000182", "sent_id": 5}], "id": "train_000801", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The review of Amber Ranking Study 4 reported on 2026-06-09 that it used LoRA adaptation and did not use a reward model.", "evidence": [], "id": "train_000802", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone H1 deadline from 2026-07-19 to 2026-07-27 on 2026-06-24.", "evidence": [], "id": "train_000803", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Orchid-8B achieved 0.871 evidence F1 on LabQA-3 for Project Aster on 2026-05-09.", "evidence": [{"doc_id": "doc_000015", "sent_id": 6}], "id": "train_000804", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Marble-3B achieved 0.612 latency efficiency score on VestaLogs-3 for Project Saffron on 2026-04-18.", "evidence": [{"doc_id": "doc_000190", "sent_id": 6}], "id": "train_000805", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid changed its method from chain verification to structured prompting on 2026-04-19.", "evidence": [{"doc_id": "doc_000234", "sent_id": 4}], "id": "train_000806", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared latency efficiency score runs, Cedar-3B achieved a higher latency efficiency score than Orchid-3B.", "evidence": [{"doc_id": "doc_000327", "sent_id": 6}, {"doc_id": "doc_000490", "sent_id": 6}], "id": "train_000807", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Vela-3B achieved 0.651 evidence F1 on LabQA-2 for Project Sonata on 2026-06-22.", "evidence": [{"doc_id": "doc_000281", "sent_id": 1}], "id": "train_000808", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-05-11 recorded latency efficiency score for Vela-3B on LumenFacts-3 using Node Fir-10 on 2026-06-30.", "evidence": [{"doc_id": "doc_000241", "sent_id": 2}, {"doc_id": "doc_000348", "sent_id": 7}], "id": "train_000809", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Elm-08 allocated 6 GPUs to Project Meridian on 2026-04-04.", "evidence": [{"doc_id": "doc_000393", "sent_id": 6}], "id": "train_000810", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mina Shah was assigned as the lead for Project Aster on 2026-05-11.", "evidence": [{"doc_id": "doc_000326", "sent_id": 2}], "id": "train_000811", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared evidence F1 runs, Nova-3B achieved a higher evidence F1 than Marble-8B.", "evidence": [{"doc_id": "doc_000118", "sent_id": 6}, {"doc_id": "doc_000237", "sent_id": 6}], "id": "train_000812", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Samir Ames was assigned as the lead on 2026-06-16 moved the Milestone L1 deadline to 2026-06-30 on 2026-06-24.", "evidence": [{"doc_id": "doc_000378", "sent_id": 7}, {"doc_id": "doc_000231", "sent_id": 8}], "id": "train_000813", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Priya Vale was assigned as the evaluation owner for Project Saffron on 2026-06-23.", "evidence": [{"doc_id": "doc_000259", "sent_id": 7}], "id": "train_000814", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Maple-01 allocated 5 GPUs to Project Sonata on 2026-05-09.", "evidence": [{"doc_id": "doc_000303", "sent_id": 6}], "id": "train_000815", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Vela-3B for reranking on 2026-06-05.", "evidence": [{"doc_id": "doc_000403", "sent_id": 8}], "id": "train_000816", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid selected Atlas-8B for evidence retrieval on 2026-05-07.", "evidence": [{"doc_id": "doc_000316", "sent_id": 4}], "id": "train_000817", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Rohan Park was assigned as the lead for Project Sonata on 2026-06-28.", "evidence": [], "id": "train_000818", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Nadia Chen was assigned as the evaluation owner for Project Meridian on 2026-04-06.", "evidence": [{"doc_id": "doc_000437", "sent_id": 2}], "id": "train_000819", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Marble-7B achieved 0.571 accuracy on VestaLogs-3 for Project Saffron on 2026-06-30.", "evidence": [], "id": "train_000820", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Atlas-3B for evidence retrieval on 2026-06-22.", "evidence": [], "id": "train_000821", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from threshold search to contrastive tuning on 2026-06-10.", "evidence": [{"doc_id": "doc_000218", "sent_id": 5}], "id": "train_000822", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Helix-3B achieved 0.883 evidence F1 on OrionBench-2 for Project Aster on 2026-05-04.", "evidence": [{"doc_id": "doc_000223", "sent_id": 2}], "id": "train_000823", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Lena Costa was assigned as the data steward on 2026-05-05 selected Vela-7B for reranking on 2026-06-14.", "evidence": [{"doc_id": "doc_000133", "sent_id": 7}, {"doc_id": "doc_000217", "sent_id": 4}], "id": "train_000824", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-05-12 recorded evidence F1 for Orchid-7B on LabQA-3 using Node Cedar-02 on 2026-04-13.", "evidence": [{"doc_id": "doc_000151", "sent_id": 7}, {"doc_id": "doc_000176", "sent_id": 2}], "id": "train_000825", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from dense retrieval to rank fusion on 2026-05-08.", "evidence": [], "id": "train_000826", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor had a failed run with Helix-8B on Node Cedar-02 because of a missing-index error on 2026-06-04.", "evidence": [{"doc_id": "doc_000218", "sent_id": 3}], "id": "train_000827", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from sentence pruning to cross-encoder reranking on 2026-06-12.", "evidence": [], "id": "train_000828", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Noah Chen was assigned as the retrieval owner on 2026-04-06 recorded evidence F1 for Finch-3B on NereidNotes-2 using Node Juniper-06 on 2026-06-09.", "evidence": [{"doc_id": "doc_000271", "sent_id": 2}, {"doc_id": "doc_000472", "sent_id": 7}], "id": "train_000829", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Nova-7B achieved 0.765 accuracy on LumenFacts-2 for Project Meridian on 2026-05-16.", "evidence": [{"doc_id": "doc_000316", "sent_id": 7}], "id": "train_000830", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from LoRA adaptation to alias expansion on 2026-05-17.", "evidence": [{"doc_id": "doc_000138", "sent_id": 4}], "id": "train_000831", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Priya Moss was assigned as the data steward for Project Saffron on 2026-06-22.", "evidence": [{"doc_id": "doc_000119", "sent_id": 2}], "id": "train_000832", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Helix-3B achieved 0.857 latency efficiency score on CedarQA-3 for Project Anchor on 2026-06-30.", "evidence": [], "id": "train_000833", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Nadia Chen was assigned as the evaluation owner on 2026-06-16 changed its method from hybrid retrieval to QLoRA adaptation on 2026-04-11.", "evidence": [{"doc_id": "doc_000093", "sent_id": 7}, {"doc_id": "doc_000235", "sent_id": 6}], "id": "train_000834", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Nimbus-3B achieved a higher evidence F1 than Aster-3B.", "evidence": [{"doc_id": "doc_000056", "sent_id": 6}, {"doc_id": "doc_000448", "sent_id": 6}], "id": "train_000835", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Sofia Nadir was assigned as the retrieval owner on 2026-06-02 changed its method from evidence pooling to calibrated voting on 2026-05-03.", "evidence": [{"doc_id": "doc_000417", "sent_id": 7}, {"doc_id": "doc_000412", "sent_id": 4}], "id": "train_000836", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Atlas-7B for calibration on 2026-04-19.", "evidence": [{"doc_id": "doc_000269", "sent_id": 4}], "id": "train_000837", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Mira Sato was assigned as the data steward on 2026-05-12 selected Nova-8B for reranking on 2026-05-19.", "evidence": [{"doc_id": "doc_000015", "sent_id": 7}, {"doc_id": "doc_000291", "sent_id": 7}], "id": "train_000838", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Talia Marin was assigned as the lead on 2026-06-08 selected Orchid-3B for reranking on 2026-06-04.", "evidence": [{"doc_id": "doc_000259", "sent_id": 2}, {"doc_id": "doc_000174", "sent_id": 3}], "id": "train_000839", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor's run with Orchid-3B failed on Node Cedar-02 because of an unstable-validation-loss error on 2026-06-12.", "evidence": [{"doc_id": "doc_000486", "sent_id": 8}], "id": "train_000840", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid changed its method from confidence calibration to chain verification on 2026-04-28.", "evidence": [], "id": "train_000841", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from threshold search to reward reranking on 2026-06-22.", "evidence": [], "id": "train_000842", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron's run with Quartz-8B failed on Node Pine-07 because of an unstable-validation-loss error on 2026-06-04.", "evidence": [{"doc_id": "doc_000072", "sent_id": 3}], "id": "train_000843", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Ravi Nadir was assigned as the data steward for Project Meridian on 2026-06-22.", "evidence": [{"doc_id": "doc_000103", "sent_id": 2}], "id": "train_000844", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nimbus-8B achieved 0.708 macro F1 on TraceEval-2 for Project Nereid on 2026-05-30.", "evidence": [{"doc_id": "doc_000476", "sent_id": 7}], "id": "train_000845", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Aster-3B achieved 0.705 evidence F1 on RiverBench-3 for Project Sonata on 2026-04-28.", "evidence": [{"doc_id": "doc_000254", "sent_id": 7}], "id": "train_000846", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from hybrid retrieval to document chunking on 2026-05-03.", "evidence": [{"doc_id": "doc_000307", "sent_id": 4}], "id": "train_000847", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron had a failed run with Aster-7B on Node Willow-05 because of a missing-index error on 2026-06-25.", "evidence": [{"doc_id": "doc_000422", "sent_id": 3}], "id": "train_000848", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Juniper-06 allocated 2 GPUs to Project Anchor on 2026-04-05.", "evidence": [], "id": "train_000849", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster changed its method from sentence pruning to cross-encoder reranking on 2026-05-17.", "evidence": [{"doc_id": "doc_000030", "sent_id": 4}], "id": "train_000850", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Quiet Retriever Study 1 reported on 2026-05-29 that it used structured prompting and did not use a reward model.", "evidence": [{"doc_id": "doc_000149", "sent_id": 8}], "id": "train_000851", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata selected Aster-3B for evidence retrieval on 2026-05-23.", "evidence": [], "id": "train_000852", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The review of Amber Ranking Study 4 reported on 2026-06-13 that it used late interaction and did not use a reward model.", "evidence": [], "id": "train_000853", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Nadia Singh was assigned as the data steward for Project Meridian on 2026-06-23.", "evidence": [{"doc_id": "doc_000031", "sent_id": 7}], "id": "train_000854", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Felix Brooks was assigned as the retrieval owner for Project Aster on 2026-04-22.", "evidence": [], "id": "train_000855", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from chain verification to structured prompting on 2026-05-17.", "evidence": [{"doc_id": "doc_000078", "sent_id": 4}], "id": "train_000856", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone V1 deadline from 2026-05-30 to 2026-06-03 on 2026-05-13.", "evidence": [], "id": "train_000857", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Cedar-02 allocated 4 GPUs to Project Anchor on 2026-04-04.", "evidence": [], "id": "train_000858", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Juniper-06 allocated 2 GPUs to Project Anchor on 2026-04-19.", "evidence": [], "id": "train_000859", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Amber Ranking Study 1 reported on 2026-06-16 that it used teacher distillation and did not use a reward model.", "evidence": [], "id": "train_000860", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "River-3B achieved 0.925 macro F1 on RiverBench for Project Aster on 2026-06-01.", "evidence": [{"doc_id": "doc_000007", "sent_id": 2}], "id": "train_000861", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Finch-8B achieved 0.825 macro F1 on NereidNotes-2 for Project Aster on 2026-06-16.", "evidence": [{"doc_id": "doc_000334", "sent_id": 7}], "id": "train_000862", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from BM25 retrieval to evidence pooling on 2026-05-11.", "evidence": [], "id": "train_000863", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nora Sol was assigned as the retrieval owner for Project Nereid on 2026-05-12.", "evidence": [{"doc_id": "doc_000165", "sent_id": 7}], "id": "train_000864", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Lumen-8B achieved 0.889 macro F1 on TraceEval for Project Aster on 2026-06-29.", "evidence": [{"doc_id": "doc_000373", "sent_id": 2}], "id": "train_000865", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron's run with Mira-8B failed because of a checkpoint-mismatch error on 2026-04-30 while using Node Maple-01.", "evidence": [{"doc_id": "doc_000215", "sent_id": 3}], "id": "train_000866", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Darian Hale was assigned as the retrieval owner for Project Nereid on 2026-06-30.", "evidence": [{"doc_id": "doc_000231", "sent_id": 12}], "id": "train_000867", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Vela-3B achieved 0.655 accuracy on LumenFacts-3 for Project Saffron on 2026-06-20.", "evidence": [{"doc_id": "doc_000405", "sent_id": 6}], "id": "train_000868", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Marble-8B achieved 0.702 accuracy on VestaLogs-3 for Project Saffron on 2026-04-13.", "evidence": [{"doc_id": "doc_000024", "sent_id": 2}], "id": "train_000869", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian moved the Milestone P1 deadline from 2026-06-06 to 2026-06-16 on 2026-05-21.", "evidence": [], "id": "train_000870", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Nora Bauer was assigned as the lead on 2026-06-09 had a run with Cedar-7B that failed because of an out-of-memory error on 2026-04-09 while using Node Laurel-11.", "evidence": [{"doc_id": "doc_000113", "sent_id": 6}, {"doc_id": "doc_000055", "sent_id": 3}], "id": "train_000871", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Quartz-3B for claim classification on 2026-04-09.", "evidence": [{"doc_id": "doc_000061", "sent_id": 3}], "id": "train_000872", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Aster-7B achieved a higher macro F1 than Kestrel-7B.", "evidence": [{"doc_id": "doc_000276", "sent_id": 2}, {"doc_id": "doc_000207", "sent_id": 7}], "id": "train_000873", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Sycamore-13 allocated 6 GPUs to Project Saffron on 2026-04-25.", "evidence": [{"doc_id": "doc_000398", "sent_id": 6}], "id": "train_000874", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster's run with Finch-7B failed on Node Juniper-06 because of an unstable-validation-loss error on 2026-06-05.", "evidence": [{"doc_id": "doc_000080", "sent_id": 8}], "id": "train_000875", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid had a failed run with Aster-3B on Node Hazel-14 because of an out-of-memory error on 2026-04-10.", "evidence": [{"doc_id": "doc_000422", "sent_id": 8}], "id": "train_000876", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Vera Kim was assigned as the lead on 2026-06-01 selected Aster-3B for evidence retrieval on 2026-04-16.", "evidence": [{"doc_id": "doc_000461", "sent_id": 2}, {"doc_id": "doc_000268", "sent_id": 3}], "id": "train_000877", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from calibrated voting to confidence calibration on 2026-06-08.", "evidence": [], "id": "train_000878", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Anika Sato was assigned as the lead on 2026-06-01 received 3 GPUs from Node Poplar-12 on 2026-04-03.", "evidence": [{"doc_id": "doc_000463", "sent_id": 2}, {"doc_id": "doc_000348", "sent_id": 8}], "id": "train_000879", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Felix Brooks was assigned as the retrieval owner on 2026-04-13 selected River-8B for evidence retrieval on 2026-04-02.", "evidence": [{"doc_id": "doc_000402", "sent_id": 2}, {"doc_id": "doc_000070", "sent_id": 3}], "id": "train_000880", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Aster-8B for evidence retrieval on 2026-04-09.", "evidence": [{"doc_id": "doc_000114", "sent_id": 3}], "id": "train_000881", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Aspen-01 allocated 3 GPUs to Project Nereid on 2026-04-19.", "evidence": [], "id": "train_000882", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Julian Gray was assigned as the lead on 2026-04-14 changed its method from LoRA adaptation to alias expansion on 2026-06-07.", "evidence": [{"doc_id": "doc_000265", "sent_id": 8}, {"doc_id": "doc_000037", "sent_id": 4}], "id": "train_000883", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster moved the Milestone N1 deadline to 2026-05-26 on 2026-05-20.", "evidence": [{"doc_id": "doc_000138", "sent_id": 5}], "id": "train_000884", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron had a failed run with Quartz-7B on Node Pine-07 because of a missing-index error on 2026-05-21.", "evidence": [{"doc_id": "doc_000456", "sent_id": 3}], "id": "train_000885", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Spruce-03 allocated 2 GPUs to Project Anchor on 2026-04-04.", "evidence": [], "id": "train_000886", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron selected Quartz-7B for claim classification on 2026-06-26.", "evidence": [{"doc_id": "doc_000376", "sent_id": 8}], "id": "train_000887", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Leo Park was assigned as the evaluation owner for Project Anchor on 2026-04-13.", "evidence": [{"doc_id": "doc_000289", "sent_id": 2}], "id": "train_000888", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor had a failed run with Helix-8B on Node Juniper-06 because of a missing-index error on 2026-05-15.", "evidence": [{"doc_id": "doc_000230", "sent_id": 8}], "id": "train_000889", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Atlas-3B achieved a higher evidence F1 than Lumen-3B.", "evidence": [{"doc_id": "doc_000412", "sent_id": 6}, {"doc_id": "doc_000284", "sent_id": 6}], "id": "train_000890", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared accuracy runs, Kestrel-7B achieved a higher accuracy than Finch-8B.", "evidence": [{"doc_id": "doc_000138", "sent_id": 6}, {"doc_id": "doc_000403", "sent_id": 6}], "id": "train_000891", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor selected Orchid-7B for reranking on 2026-04-02.", "evidence": [{"doc_id": "doc_000246", "sent_id": 3}], "id": "train_000892", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Nova-3B for reranking on 2026-04-25.", "evidence": [], "id": "train_000893", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian's run with Nova-7B failed because of a checkpoint-mismatch error on 2026-05-24 while using Node Aspen-01.", "evidence": [], "id": "train_000894", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared latency efficiency score runs, River-7B achieved a higher latency efficiency score than Quartz-7B.", "evidence": [{"doc_id": "doc_000397", "sent_id": 2}, {"doc_id": "doc_000067", "sent_id": 7}], "id": "train_000895", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid changed its method from LoRA adaptation to alias expansion on 2026-05-14.", "evidence": [], "id": "train_000896", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid moved the Milestone J1 deadline to 2026-06-05 on 2026-05-07.", "evidence": [], "id": "train_000897", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-05-12 selected Kestrel-8B for error analysis on 2026-04-19.", "evidence": [{"doc_id": "doc_000165", "sent_id": 7}, {"doc_id": "doc_000264", "sent_id": 4}], "id": "train_000898", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Amber Ranking Study 4 reported on 2026-04-16 that it used LoRA adaptation and did not use a reward model.", "evidence": [], "id": "train_000899", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Aspen-01 allocated 6 GPUs to Project Meridian on 2026-04-11.", "evidence": [{"doc_id": "doc_000484", "sent_id": 6}], "id": "train_000900", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata selected Aster-7B for evidence retrieval on 2026-06-11.", "evidence": [{"doc_id": "doc_000405", "sent_id": 3}], "id": "train_000901", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Lumen-8B achieved 0.934 accuracy on MemoTrace-3 for Project Anchor on 2026-05-25.", "evidence": [{"doc_id": "doc_000486", "sent_id": 2}], "id": "train_000902", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-06-01 had a run with Helix-8B on Node Poplar-12 that failed because of a checkpoint-mismatch error on 2026-04-09.", "evidence": [{"doc_id": "doc_000378", "sent_id": 2}, {"doc_id": "doc_000278", "sent_id": 3}], "id": "train_000903", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Atlas-7B for evidence retrieval on 2026-04-12.", "evidence": [{"doc_id": "doc_000278", "sent_id": 4}], "id": "train_000904", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-06-09 received 6 GPUs from Node Spruce-03 on 2026-04-08.", "evidence": [{"doc_id": "doc_000388", "sent_id": 7}, {"doc_id": "doc_000343", "sent_id": 4}], "id": "train_000905", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata moved the Milestone X1 deadline to 2026-05-28 on 2026-05-23.", "evidence": [], "id": "train_000906", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Leo Park was assigned as the evaluation owner on 2026-05-12 had a run with Finch-3B on Node Juniper-06 that failed because of an out-of-memory error on 2026-05-08.", "evidence": [{"doc_id": "doc_000394", "sent_id": 7}, {"doc_id": "doc_000397", "sent_id": 8}], "id": "train_000907", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata moved the Milestone F1 deadline to 2026-05-31 on 2026-05-14.", "evidence": [], "id": "train_000908", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Mira-7B for calibration on 2026-05-28.", "evidence": [{"doc_id": "doc_000388", "sent_id": 3}], "id": "train_000909", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron had a failed run with Mira-3B on Node Sycamore-13 because of a missing-index error on 2026-04-18.", "evidence": [], "id": "train_000910", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid's run with Nimbus-7B failed because of an unstable-validation-loss error on 2026-05-01 while using Node Hazel-14.", "evidence": [{"doc_id": "doc_000404", "sent_id": 8}], "id": "train_000911", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Willow-05 allocated 2 GPUs to Project Saffron on 2026-04-28.", "evidence": [], "id": "train_000912", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Anika Costa was assigned as the retrieval owner on 2026-05-19 selected Mira-3B for claim classification on 2026-04-22.", "evidence": [{"doc_id": "doc_000018", "sent_id": 7}, {"doc_id": "doc_000126", "sent_id": 4}], "id": "train_000913", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid moved the Milestone V1 deadline to 2026-07-06 on 2026-06-10.", "evidence": [{"doc_id": "doc_000169", "sent_id": 5}], "id": "train_000914", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster selected Atlas-7B for claim classification on 2026-05-24.", "evidence": [{"doc_id": "doc_000391", "sent_id": 4}], "id": "train_000915", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Mira Sato was assigned as the data steward on 2026-05-12 selected Nimbus-8B for calibration on 2026-04-08.", "evidence": [{"doc_id": "doc_000015", "sent_id": 7}, {"doc_id": "doc_000383", "sent_id": 5}], "id": "train_000916", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Fir-10 allocated 1 GPU to Project Sonata on 2026-06-28.", "evidence": [], "id": "train_000917", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster selected Helix-7B for claim classification on 2026-06-11.", "evidence": [{"doc_id": "doc_000376", "sent_id": 3}], "id": "train_000918", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata had a failed run with Orchid-3B on Node Willow-05 because of an out-of-memory error on 2026-05-07.", "evidence": [{"doc_id": "doc_000086", "sent_id": 3}], "id": "train_000919", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Helix-7B achieved a higher evidence F1 than Atlas-7B.", "evidence": [{"doc_id": "doc_000252", "sent_id": 7}, {"doc_id": "doc_000242", "sent_id": 2}], "id": "train_000920", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian selected Nova-8B for reranking on 2026-04-25.", "evidence": [], "id": "train_000921", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Vela-3B achieved 0.625 latency efficiency score on LumenFacts-3 for Project Saffron on 2026-05-19.", "evidence": [], "id": "train_000922", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-04-28 recorded latency efficiency score for Nova-3B on LumenFacts-2 using Node Aspen-01 on 2026-05-24.", "evidence": [{"doc_id": "doc_000234", "sent_id": 7}, {"doc_id": "doc_000051", "sent_id": 3}], "id": "train_000923", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Vector Lantern Study 2 reported on 2026-06-22 that it used hybrid retrieval and used a reward model.", "evidence": [{"doc_id": "doc_000188", "sent_id": 2}], "id": "train_000924", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian's run with Atlas-3B failed because of a checkpoint-mismatch error on 2026-05-09 while using Node Hazel-14.", "evidence": [], "id": "train_000925", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Julian Gray was assigned as the lead for Project Nereid on 2026-04-28.", "evidence": [{"doc_id": "doc_000480", "sent_id": 7}], "id": "train_000926", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Cedar-7B achieved a higher macro F1 than Mira-7B.", "evidence": [{"doc_id": "doc_000455", "sent_id": 7}, {"doc_id": "doc_000046", "sent_id": 2}], "id": "train_000927", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Dr. Julian Gray was assigned as the lead on 2026-04-14 changed its method from evidence pooling to BM25 retrieval on 2026-06-10.", "evidence": [{"doc_id": "doc_000265", "sent_id": 8}, {"doc_id": "doc_000334", "sent_id": 5}], "id": "train_000928", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron moved the Milestone D2 deadline from 2026-06-22 to 2026-07-02 on 2026-06-13.", "evidence": [], "id": "train_000929", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Mina Shah was assigned as the lead on 2026-04-21 changed its method from document chunking to hybrid retrieval on 2026-06-14.", "evidence": [{"doc_id": "doc_000190", "sent_id": 7}, {"doc_id": "doc_000045", "sent_id": 4}], "id": "train_000930", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, Nova-8B achieved a higher accuracy than Atlas-8B.", "evidence": [{"doc_id": "doc_000328", "sent_id": 1}, {"doc_id": "doc_000276", "sent_id": 7}], "id": "train_000931", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid selected Cedar-3B for error analysis on 2026-05-22.", "evidence": [{"doc_id": "doc_000401", "sent_id": 8}], "id": "train_000932", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Nova-3B achieved a higher macro F1 than Quartz-8B.", "evidence": [{"doc_id": "doc_000450", "sent_id": 7}, {"doc_id": "doc_000229", "sent_id": 2}], "id": "train_000933", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Aspen-01 allocated 3 GPUs to Project Nereid on 2026-06-06.", "evidence": [{"doc_id": "doc_000128", "sent_id": 6}], "id": "train_000934", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Aster changed its method from threshold search to reward reranking on 2026-06-28.", "evidence": [{"doc_id": "doc_000182", "sent_id": 4}], "id": "train_000935", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Julian Gray was assigned as the lead on 2026-06-02 had a failed run with Nimbus-3B on Node Elm-08 because of a missing-index error on 2026-05-14.", "evidence": [{"doc_id": "doc_000212", "sent_id": 3}, {"doc_id": "doc_000344", "sent_id": 2}], "id": "train_000936", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Dr. Noah Vale was assigned as the lead for Project Aster on 2026-04-08.", "evidence": [], "id": "train_000937", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Quiet Retriever Study 4 reported on 2026-06-04 that it used structured prompting and did not use a reward model.", "evidence": [], "id": "train_000938", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Lumen-3B achieved 0.518 evidence F1 on TraceEval for Project Aster on 2026-05-16.", "evidence": [{"doc_id": "doc_000284", "sent_id": 6}], "id": "train_000939", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Cedar-02 allocated 4 GPUs to Project Anchor on 2026-05-24.", "evidence": [], "id": "train_000940", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian moved the Milestone J1 deadline from 2026-06-04 to 2026-06-08 on 2026-05-16.", "evidence": [], "id": "train_000941", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Quiet Retriever Study 3 reported on 2026-04-24 that it used evidence pooling and did not use a reward model.", "evidence": [{"doc_id": "doc_000464", "sent_id": 8}], "id": "train_000942", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Mira-3B achieved 0.613 accuracy on MemoTrace-2 for Project Saffron on 2026-04-15.", "evidence": [], "id": "train_000943", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Lena Sol was assigned as the evaluation owner for Project Saffron on 2026-05-26.", "evidence": [{"doc_id": "doc_000326", "sent_id": 9}], "id": "train_000944", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian changed its method from hard-negative mining to LoRA adaptation on 2026-05-18.", "evidence": [], "id": "train_000945", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata moved the Milestone L1 deadline to 2026-06-30 on 2026-06-24.", "evidence": [{"doc_id": "doc_000231", "sent_id": 8}], "id": "train_000946", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-06-09 recorded latency efficiency score for Finch-8B on VestaLogs using Node Juniper-06 on 2026-04-27.", "evidence": [{"doc_id": "doc_000388", "sent_id": 7}, {"doc_id": "doc_000137", "sent_id": 2}], "id": "train_000947", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-21 recorded evidence F1 for Atlas-3B on RiverBench-2 using Node Hazel-14 on 2026-05-09.", "evidence": [{"doc_id": "doc_000340", "sent_id": 3}, {"doc_id": "doc_000412", "sent_id": 6}], "id": "train_000948", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Quartz-8B achieved 0.637 latency efficiency score on CedarQA-2 for Project Saffron on 2026-04-04.", "evidence": [{"doc_id": "doc_000103", "sent_id": 8}], "id": "train_000949", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Rohan Park was assigned as the lead on 2026-06-30 selected Quartz-7B for claim classification on 2026-05-07.", "evidence": [{"doc_id": "doc_000193", "sent_id": 8}, {"doc_id": "doc_000018", "sent_id": 3}], "id": "train_000950", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster changed its method from evidence pooling to calibrated voting on 2026-04-16.", "evidence": [], "id": "train_000951", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid moved the Milestone P1 deadline to 2026-04-22 on 2026-04-02.", "evidence": [], "id": "train_000952", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata moved the Milestone F1 deadline to 2026-05-31 on 2026-05-13.", "evidence": [{"doc_id": "doc_000162", "sent_id": 5}], "id": "train_000953", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid's run with Nimbus-8B failed because of an out-of-memory error on 2026-04-25 while using Node Elm-08.", "evidence": [], "id": "train_000954", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Stable Chains Study 3 reported on 2026-06-19 that it used BM25 retrieval and did not use a reward model.", "evidence": [{"doc_id": "doc_000451", "sent_id": 8}], "id": "train_000955", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata moved the Milestone F1 deadline to 2026-06-23 on 2026-06-06.", "evidence": [], "id": "train_000956", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Cedar-02 allocated 5 GPUs to Project Aster on 2026-04-25.", "evidence": [{"doc_id": "doc_000264", "sent_id": 6}], "id": "train_000957", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-04-07 recorded macro F1 for Lumen-8B on TraceEval using Node Spruce-03 on 2026-05-24.", "evidence": [{"doc_id": "doc_000103", "sent_id": 9}, {"doc_id": "doc_000333", "sent_id": 4}], "id": "train_000958", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor's run with Helix-7B failed because of a checkpoint-mismatch error on 2026-04-10 while using Node Cedar-02.", "evidence": [{"doc_id": "doc_000393", "sent_id": 8}], "id": "train_000959", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Felix Brooks was assigned as the retrieval owner on 2026-04-13 selected Aster-7B for error analysis on 2026-05-27.", "evidence": [{"doc_id": "doc_000402", "sent_id": 2}, {"doc_id": "doc_000102", "sent_id": 5}], "id": "train_000960", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster's run with Orchid-7B failed on Node Cedar-02 because of an out-of-memory error on 2026-04-16.", "evidence": [{"doc_id": "doc_000479", "sent_id": 3}], "id": "train_000961", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata moved the Milestone F1 deadline to 2026-05-27 on 2026-05-07.", "evidence": [], "id": "train_000962", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Laurel-11 allocated 6 GPUs to Project Meridian on 2026-06-13.", "evidence": [{"doc_id": "doc_000034", "sent_id": 6}], "id": "train_000963", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Aspen-01 allocated 3 GPUs to Project Nereid on 2026-05-17.", "evidence": [], "id": "train_000964", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Selene Kim was assigned as the evaluation owner for Project Anchor on 2026-06-09.", "evidence": [{"doc_id": "doc_000388", "sent_id": 7}], "id": "train_000965", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Nova-3B achieved a higher macro F1 than Kestrel-3B.", "evidence": [{"doc_id": "doc_000444", "sent_id": 7}, {"doc_id": "doc_000332", "sent_id": 3}], "id": "train_000966", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Selene Rios was assigned as the data steward on 2026-06-08 received 6 GPUs from Node Rowan-09 on 2026-05-27.", "evidence": [{"doc_id": "doc_000225", "sent_id": 2}, {"doc_id": "doc_000500", "sent_id": 4}], "id": "train_000967", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron had a failed run with Quartz-8B on Node Pine-07 because of a missing-index error on 2026-04-11.", "evidence": [{"doc_id": "doc_000243", "sent_id": 6}], "id": "train_000968", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira-8B achieved 0.598 macro F1 on TraceEval-3 for Project Sonata on 2026-04-04.", "evidence": [{"doc_id": "doc_000182", "sent_id": 6}], "id": "train_000969", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "River-8B achieved 0.569 latency efficiency score on SignalSet-2 for Project Anchor on 2026-04-29.", "evidence": [], "id": "train_000970", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Vela-3B achieved 0.820 evidence F1 on LabQA-2 for Project Sonata on 2026-05-06.", "evidence": [], "id": "train_000971", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid changed its method from calibrated voting to confidence calibration on 2026-04-23.", "evidence": [], "id": "train_000972", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor changed its method from LoRA adaptation to alias expansion on 2026-06-28.", "evidence": [{"doc_id": "doc_000103", "sent_id": 5}], "id": "train_000973", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron moved the Milestone L1 deadline from 2026-07-12 to 2026-07-20 on 2026-06-27.", "evidence": [], "id": "train_000974", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Nova-8B achieved 0.735 latency efficiency score on LumenFacts-2 for Project Meridian on 2026-05-10.", "evidence": [], "id": "train_000975", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Rohan Park was assigned as the lead on 2026-06-30 selected Finch-8B for reranking on 2026-05-20.", "evidence": [{"doc_id": "doc_000193", "sent_id": 8}, {"doc_id": "doc_000350", "sent_id": 5}], "id": "train_000976", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Cedar-3B achieved 0.726 evidence F1 on NereidNotes-3 for Project Nereid on 2026-05-16.", "evidence": [{"doc_id": "doc_000426", "sent_id": 6}], "id": "train_000977", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Vera Kim was assigned as the lead on 2026-06-01 selected Atlas-7B for reranking on 2026-05-06.", "evidence": [{"doc_id": "doc_000461", "sent_id": 2}, {"doc_id": "doc_000088", "sent_id": 5}], "id": "train_000978", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from calibrated voting to confidence calibration on 2026-05-01.", "evidence": [], "id": "train_000979", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Juniper-06 allocated 4 GPUs to Project Anchor on 2026-06-20.", "evidence": [], "id": "train_000980", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from reward reranking to data mixing on 2026-05-26.", "evidence": [], "id": "train_000981", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata changed its method from structured prompting to QLoRA adaptation on 2026-04-26.", "evidence": [{"doc_id": "doc_000285", "sent_id": 4}], "id": "train_000982", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Stable Chains Study 1 reported on 2026-06-18 that it used document chunking and did not use a reward model.", "evidence": [], "id": "train_000983", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from BM25 retrieval to late interaction on 2026-06-03.", "evidence": [{"doc_id": "doc_000207", "sent_id": 5}], "id": "train_000984", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Pale Compass Study 4 reported on 2026-06-02 that it used data mixing and did not use a reward model.", "evidence": [], "id": "train_000985", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Quiet Retriever Study 1 reported on 2026-04-02 that it used dense retrieval and did not use a reward model.", "evidence": [], "id": "train_000986", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster moved the Milestone Z1 deadline to 2026-06-21 on 2026-06-04.", "evidence": [], "id": "train_000987", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-23 received 3 GPUs from Node Fir-10 on 2026-05-06.", "evidence": [{"doc_id": "doc_000225", "sent_id": 7}, {"doc_id": "doc_000060", "sent_id": 4}], "id": "train_000988", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Quartz-7B achieved 0.622 macro F1 on OrionBench for Project Sonata on 2026-05-16.", "evidence": [{"doc_id": "doc_000401", "sent_id": 6}], "id": "train_000989", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected River-8B for evidence retrieval on 2026-06-26.", "evidence": [{"doc_id": "doc_000259", "sent_id": 8}], "id": "train_000990", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-04-21 selected Vela-7B for calibration on 2026-04-15.", "evidence": [{"doc_id": "doc_000114", "sent_id": 7}, {"doc_id": "doc_000157", "sent_id": 5}], "id": "train_000991", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid moved the Milestone V1 deadline to 2026-06-26 on 2026-05-27.", "evidence": [{"doc_id": "doc_000043", "sent_id": 5}], "id": "train_000992", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Stable Chains Study 2 reported on 2026-06-13 that it used document chunking and did not use a reward model.", "evidence": [], "id": "train_000993", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Nadia Singh was assigned as the data steward on 2026-06-29 selected Finch-7B for evidence retrieval on 2026-05-03.", "evidence": [{"doc_id": "doc_000292", "sent_id": 2}, {"doc_id": "doc_000385", "sent_id": 4}], "id": "train_000994", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "River-8B achieved 0.869 latency efficiency score on SignalSet-2 for Project Anchor on 2026-05-17.", "evidence": [], "id": "train_000995", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Nadia Singh was assigned as the data steward on 2026-06-29 received 4 GPUs from Node Aspen-01 on 2026-05-20.", "evidence": [{"doc_id": "doc_000292", "sent_id": 2}, {"doc_id": "doc_000297", "sent_id": 5}], "id": "train_000996", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Vera Torres was assigned as the retrieval owner on 2026-06-02 received 3 GPUs from Node Aspen-01 on 2026-04-06.", "evidence": [{"doc_id": "doc_000403", "sent_id": 7}, {"doc_id": "doc_000342", "sent_id": 1}], "id": "train_000997", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Quartz-8B for claim classification on 2026-06-19.", "evidence": [{"doc_id": "doc_000481", "sent_id": 8}], "id": "train_000998", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from hybrid retrieval to QLoRA adaptation on 2026-04-01.", "evidence": [{"doc_id": "doc_000393", "sent_id": 5}], "id": "train_000999", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Willow-05 allocated 6 GPUs to Project Saffron on 2026-06-27.", "evidence": [{"doc_id": "doc_000049", "sent_id": 6}], "id": "train_001000", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-05-12 selected Kestrel-8B for claim classification on 2026-04-23.", "evidence": [{"doc_id": "doc_000165", "sent_id": 7}, {"doc_id": "doc_000036", "sent_id": 4}], "id": "train_001001", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Quartz-3B achieved 0.723 macro F1 on OrionBench for Project Sonata on 2026-04-14.", "evidence": [{"doc_id": "doc_000484", "sent_id": 7}], "id": "train_001002", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Granite Context Study 4 reported on 2026-05-13 that it used structured prompting and did not use a reward model.", "evidence": [{"doc_id": "doc_000478", "sent_id": 5}], "id": "train_001003", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Finch-3B for error analysis on 2026-06-04.", "evidence": [{"doc_id": "doc_000037", "sent_id": 3}], "id": "train_001004", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Aster changed its method from cross-encoder reranking to sentence pruning on 2026-04-22.", "evidence": [{"doc_id": "doc_000254", "sent_id": 5}], "id": "train_001005", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Felix Brooks was assigned as the retrieval owner on 2026-06-01 changed its method from document chunking to hybrid retrieval on 2026-06-14.", "evidence": [{"doc_id": "doc_000435", "sent_id": 2}, {"doc_id": "doc_000045", "sent_id": 4}], "id": "train_001006", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Kestrel-3B achieved 0.672 macro F1 on OrionBench-3 for Project Nereid on 2026-06-02.", "evidence": [], "id": "train_001007", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-30 recorded macro F1 for Atlas-8B on RiverBench-2 using Node Hazel-14 on 2026-06-22.", "evidence": [{"doc_id": "doc_000288", "sent_id": 10}, {"doc_id": "doc_000318", "sent_id": 2}], "id": "train_001008", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Nimbus-7B achieved a higher latency efficiency score than Finch-8B.", "evidence": [{"doc_id": "doc_000121", "sent_id": 2}, {"doc_id": "doc_000302", "sent_id": 7}], "id": "train_001009", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Sofia Nadir was assigned as the retrieval owner on 2026-04-21 had a run with Quartz-3B that failed because of a checkpoint-mismatch error on 2026-04-17 while using Node Pine-07.", "evidence": [{"doc_id": "doc_000145", "sent_id": 7}, {"doc_id": "doc_000122", "sent_id": 8}], "id": "train_001010", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Leo Park was assigned as the evaluation owner on 2026-06-23 changed its method from evidence pooling to calibrated voting on 2026-04-12.", "evidence": [{"doc_id": "doc_000062", "sent_id": 7}, {"doc_id": "doc_000437", "sent_id": 4}], "id": "train_001011", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from dense retrieval to rank fusion on 2026-06-26.", "evidence": [], "id": "train_001012", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Atlas-7B achieved 0.699 latency efficiency score on SignalSet-3 for Project Meridian on 2026-06-06.", "evidence": [{"doc_id": "doc_000452", "sent_id": 6}], "id": "train_001013", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mara Quinn was assigned as the lead for Project Sonata on 2026-04-22.", "evidence": [], "id": "train_001014", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Vector Lantern Study 2 reported on 2026-05-20 that it used teacher distillation and used a reward model.", "evidence": [{"doc_id": "doc_000203", "sent_id": 5}], "id": "train_001015", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Lumen-8B achieved 0.893 latency efficiency score on MemoTrace-3 for Project Anchor on 2026-06-27.", "evidence": [{"doc_id": "doc_000288", "sent_id": 8}], "id": "train_001016", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Lena Costa was assigned as the data steward on 2026-06-23 moved the Milestone F1 deadline from 2026-07-12 to 2026-07-18 on 2026-06-24.", "evidence": [{"doc_id": "doc_000042", "sent_id": 7}, {"doc_id": "doc_000306", "sent_id": 6}], "id": "train_001017", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Orchid-7B achieved a higher evidence F1 than River-7B.", "evidence": [{"doc_id": "doc_000176", "sent_id": 2}, {"doc_id": "doc_000208", "sent_id": 7}], "id": "train_001018", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Kira Iyer was assigned as the evaluation owner for Project Anchor on 2026-06-08.", "evidence": [{"doc_id": "doc_000003", "sent_id": 2}], "id": "train_001019", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Cedar-7B achieved 0.741 accuracy on VestaLogs-2 for Project Meridian on 2026-04-05.", "evidence": [], "id": "train_001020", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Mira-8B achieved a higher accuracy than Cedar-8B.", "evidence": [{"doc_id": "doc_000189", "sent_id": 2}, {"doc_id": "doc_000194", "sent_id": 7}], "id": "train_001021", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Marble-3B achieved 0.627 evidence F1 on NereidNotes for Project Sonata on 2026-05-11.", "evidence": [{"doc_id": "doc_000492", "sent_id": 2}], "id": "train_001022", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Elm-08 allocated 4 GPUs to Project Meridian on 2026-06-13.", "evidence": [{"doc_id": "doc_000356", "sent_id": 6}], "id": "train_001023", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-04-13 had a run with Lumen-3B on Node Spruce-03 that failed because of a checkpoint-mismatch error on 2026-05-15.", "evidence": [{"doc_id": "doc_000159", "sent_id": 1}, {"doc_id": "doc_000137", "sent_id": 8}], "id": "train_001024", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Finch-13B for evidence retrieval on 2026-06-07.", "evidence": [{"doc_id": "doc_000072", "sent_id": 4}], "id": "train_001025", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Priya Moss was assigned as the data steward for Project Saffron on 2026-04-06.", "evidence": [{"doc_id": "doc_000490", "sent_id": 2}], "id": "train_001026", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Mira-7B for calibration on 2026-05-07.", "evidence": [{"doc_id": "doc_000448", "sent_id": 3}], "id": "train_001027", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor's run with Orchid-8B failed on Node Cedar-02 because of a missing-index error on 2026-05-30.", "evidence": [], "id": "train_001028", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian changed its method from temporal filtering to alias expansion on 2026-06-19.", "evidence": [], "id": "train_001029", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from reward reranking to data mixing on 2026-06-21.", "evidence": [{"doc_id": "doc_000419", "sent_id": 5}], "id": "train_001030", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Felix Lind was assigned as the lead on 2026-04-14 selected River-7B for evidence retrieval on 2026-04-09.", "evidence": [{"doc_id": "doc_000292", "sent_id": 7}, {"doc_id": "doc_000490", "sent_id": 3}], "id": "train_001031", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Atlas-8B achieved 0.755 macro F1 on RiverBench-2 for Project Nereid on 2026-06-22.", "evidence": [{"doc_id": "doc_000318", "sent_id": 2}], "id": "train_001032", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Helix-3B for claim classification on 2026-05-28.", "evidence": [{"doc_id": "doc_000098", "sent_id": 3}], "id": "train_001033", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-05-18 moved the Milestone X1 deadline from 2026-06-22 to 2026-06-30 on 2026-06-10.", "evidence": [{"doc_id": "doc_000212", "sent_id": 2}, {"doc_id": "doc_000037", "sent_id": 5}], "id": "train_001034", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Willow-05 allocated 4 GPUs to Project Saffron on 2026-04-11.", "evidence": [], "id": "train_001035", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Pine-07 allocated 4 GPUs to Project Saffron on 2026-04-14.", "evidence": [], "id": "train_001036", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Nadia Singh was assigned as the data steward for Project Meridian on 2026-06-27.", "evidence": [], "id": "train_001037", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-06-15 moved the Milestone R1 deadline to 2026-04-24 on 2026-04-10.", "evidence": [{"doc_id": "doc_000465", "sent_id": 2}, {"doc_id": "doc_000415", "sent_id": 8}], "id": "train_001038", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata moved the Milestone R1 deadline to 2026-04-25 on 2026-04-08.", "evidence": [], "id": "train_001039", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Petra Adler was assigned as the data steward for Project Saffron on 2026-04-20.", "evidence": [{"doc_id": "doc_000025", "sent_id": 2}], "id": "train_001040", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Camila Quinn was assigned as the evaluation owner on 2026-04-27 received 6 GPUs from Node Birch-04 on 2026-06-10.", "evidence": [{"doc_id": "doc_000151", "sent_id": 2}, {"doc_id": "doc_000410", "sent_id": 4}], "id": "train_001041", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Orchid-7B achieved 0.875 latency efficiency score on LumenFacts for Project Anchor on 2026-06-03.", "evidence": [], "id": "train_001042", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Atlas-8B achieved 0.740 accuracy on SignalSet-3 for Project Meridian on 2026-05-30.", "evidence": [{"doc_id": "doc_000417", "sent_id": 6}], "id": "train_001043", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-04-21 recorded accuracy for Quartz-7B on CedarQA-2 using Node Pine-07 on 2026-05-18.", "evidence": [{"doc_id": "doc_000271", "sent_id": 7}, {"doc_id": "doc_000363", "sent_id": 2}], "id": "train_001044", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Felix Brooks was assigned as the retrieval owner for Project Aster on 2026-04-08.", "evidence": [], "id": "train_001045", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Leo Hale was assigned as the data steward on 2026-04-05 recorded latency efficiency score for Helix-3B on CedarQA-3 using Node Maple-01 on 2026-06-29.", "evidence": [{"doc_id": "doc_000012", "sent_id": 4}, {"doc_id": "doc_000484", "sent_id": 2}], "id": "train_001046", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Laurel-11 allocated 3 GPUs to Project Nereid on 2026-04-25.", "evidence": [{"doc_id": "doc_000183", "sent_id": 5}], "id": "train_001047", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Node Poplar-12 allocated 4 GPUs to Project Anchor on 2026-06-20.", "evidence": [{"doc_id": "doc_000261", "sent_id": 6}], "id": "train_001048", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Nora Bauer was assigned as the lead on 2026-06-09 had a run with Cedar-8B that failed because of an out-of-memory error on 2026-05-30 while using Node Laurel-11.", "evidence": [{"doc_id": "doc_000113", "sent_id": 6}, {"doc_id": "doc_000500", "sent_id": 5}], "id": "train_001049", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Juniper-06 allocated 4 GPUs to Project Anchor on 2026-05-02.", "evidence": [{"doc_id": "doc_000499", "sent_id": 6}], "id": "train_001050", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Nora Sol was assigned as the retrieval owner for Project Nereid on 2026-05-13.", "evidence": [], "id": "train_001051", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Cedar-3B achieved 0.681 accuracy on VestaLogs-2 for Project Meridian on 2026-06-21.", "evidence": [], "id": "train_001052", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Quartz-7B for calibration on 2026-06-28.", "evidence": [{"doc_id": "doc_000318", "sent_id": 4}], "id": "train_001053", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-05-10 had a failed run with Vela-8B on Node Fir-10 because of an out-of-memory error on 2026-06-11.", "evidence": [{"doc_id": "doc_000280", "sent_id": 4}, {"doc_id": "doc_000121", "sent_id": 3}], "id": "train_001054", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron had a failed run with Marble-8B on Node Maple-01 because of a missing-index error on 2026-04-10.", "evidence": [{"doc_id": "doc_000139", "sent_id": 8}], "id": "train_001055", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Pine-07 allocated 4 GPUs to Project Saffron on 2026-04-18.", "evidence": [], "id": "train_001056", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Mina Shah was assigned as the lead on 2026-06-09 moved the Milestone N1 deadline to 2026-07-06 on 2026-06-19.", "evidence": [{"doc_id": "doc_000351", "sent_id": 6}, {"doc_id": "doc_000477", "sent_id": 8}], "id": "train_001057", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron's run with Vela-3B failed on Node Fir-10 because of an out-of-memory error on 2026-04-30.", "evidence": [{"doc_id": "doc_000303", "sent_id": 3}], "id": "train_001058", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nova-7B achieved 0.675 latency efficiency score on LumenFacts-2 for Project Meridian on 2026-04-25.", "evidence": [{"doc_id": "doc_000268", "sent_id": 6}], "id": "train_001059", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Nadia Chen was assigned as the evaluation owner on 2026-05-05 moved the Milestone P1 deadline to 2026-06-06 on 2026-05-19.", "evidence": [{"doc_id": "doc_000260", "sent_id": 7}, {"doc_id": "doc_000396", "sent_id": 7}], "id": "train_001060", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Mira-3B for calibration on 2026-05-14.", "evidence": [{"doc_id": "doc_000030", "sent_id": 3}], "id": "train_001061", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Quartz-3B achieved a higher macro F1 than Nova-7B.", "evidence": [{"doc_id": "doc_000257", "sent_id": 2}, {"doc_id": "doc_000187", "sent_id": 7}], "id": "train_001062", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Granite Context Study 3 reported on 2026-05-18 that it used structured prompting and used a reward model.", "evidence": [{"doc_id": "doc_000154", "sent_id": 2}], "id": "train_001063", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-04-28 recorded latency efficiency score for Nimbus-8B on MemoTrace using Node Elm-08 on 2026-06-16.", "evidence": [{"doc_id": "doc_000234", "sent_id": 7}, {"doc_id": "doc_000218", "sent_id": 7}], "id": "train_001064", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-04-14 changed its method from rank fusion to dense retrieval on 2026-06-07.", "evidence": [{"doc_id": "doc_000209", "sent_id": 7}, {"doc_id": "doc_000435", "sent_id": 4}], "id": "train_001065", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Iris Lane was assigned as the evaluation owner for Project Anchor on 2026-06-03.", "evidence": [], "id": "train_001066", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-05-04 had a run with Nimbus-8B that failed because of an out-of-memory error on 2026-06-18 while using Node Elm-08.", "evidence": [{"doc_id": "doc_000495", "sent_id": 1}, {"doc_id": "doc_000158", "sent_id": 3}], "id": "train_001067", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Granite Context Study 4 reported on 2026-04-10 that it used sentence pruning and did not use a reward model.", "evidence": [{"doc_id": "doc_000188", "sent_id": 8}], "id": "train_001068", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Marble-3B for reranking on 2026-05-17.", "evidence": [{"doc_id": "doc_000344", "sent_id": 3}], "id": "train_001069", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor's run with River-3B failed on Node Rowan-09 because of a missing-index error on 2026-04-25.", "evidence": [], "id": "train_001070", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Vera Kim was assigned as the lead on 2026-04-20 selected Cedar-7B for error analysis on 2026-05-13.", "evidence": [{"doc_id": "doc_000368", "sent_id": 2}, {"doc_id": "doc_000014", "sent_id": 4}], "id": "train_001071", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-04-06 received 1 GPU from Node Aspen-01 on 2026-06-11.", "evidence": [{"doc_id": "doc_000114", "sent_id": 2}, {"doc_id": "doc_000155", "sent_id": 3}], "id": "train_001072", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Vela-8B for reranking on 2026-04-25.", "evidence": [], "id": "train_001073", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Kestrel-3B for claim classification on 2026-04-16.", "evidence": [{"doc_id": "doc_000289", "sent_id": 3}], "id": "train_001074", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-04-27 moved the Milestone J1 deadline to 2026-07-06 on 2026-06-25.", "evidence": [{"doc_id": "doc_000394", "sent_id": 2}, {"doc_id": "doc_000420", "sent_id": 3}], "id": "train_001075", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster had a failed run with Helix-7B on Node Rowan-09 because of an out-of-memory error on 2026-06-11.", "evidence": [{"doc_id": "doc_000135", "sent_id": 3}], "id": "train_001076", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Darian Grant was assigned as the lead for Project Nereid on 2026-04-22.", "evidence": [], "id": "train_001077", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid's run with Nova-7B failed on Node Aspen-01 because of an out-of-memory error on 2026-06-05.", "evidence": [{"doc_id": "doc_000391", "sent_id": 8}], "id": "train_001078", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mira Sato was assigned as the data steward for Project Meridian on 2026-05-18.", "evidence": [{"doc_id": "doc_000043", "sent_id": 2}], "id": "train_001079", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Priya Vale was assigned as the evaluation owner on 2026-06-23 received 4 GPUs from Node Maple-01 on 2026-05-15.", "evidence": [{"doc_id": "doc_000259", "sent_id": 7}, {"doc_id": "doc_000060", "sent_id": 7}], "id": "train_001080", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata changed its method from rank fusion to dense retrieval on 2026-06-17.", "evidence": [{"doc_id": "doc_000261", "sent_id": 5}], "id": "train_001081", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Granite Context Study 3 reported on 2026-04-15 that it used sentence pruning and used a reward model.", "evidence": [{"doc_id": "doc_000487", "sent_id": 5}], "id": "train_001082", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Dr. Jonas Nolan was assigned as the retrieval owner for Project Sonata on 2026-05-26.", "evidence": [{"doc_id": "doc_000138", "sent_id": 7}], "id": "train_001083", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Poplar-12 allocated 5 GPUs to Project Aster on 2026-05-16.", "evidence": [{"doc_id": "doc_000020", "sent_id": 6}], "id": "train_001084", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Helix-3B achieved 0.842 macro F1 on OrionBench-2 for Project Aster on 2026-06-07.", "evidence": [], "id": "train_001085", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata changed its method from cross-encoder reranking to late interaction on 2026-05-12.", "evidence": [], "id": "train_001086", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster moved the Milestone H1 deadline to 2026-05-25 on 2026-05-06.", "evidence": [], "id": "train_001087", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Atlas-7B for error analysis on 2026-04-03.", "evidence": [{"doc_id": "doc_000248", "sent_id": 8}], "id": "train_001088", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Pine-07 allocated 6 GPUs to Project Saffron on 2026-05-23.", "evidence": [{"doc_id": "doc_000321", "sent_id": 6}], "id": "train_001089", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "River-7B achieved 0.809 latency efficiency score on SignalSet-2 for Project Anchor on 2026-05-03.", "evidence": [], "id": "train_001090", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata's run with Quartz-8B failed on Node Pine-07 because of an unstable-validation-loss error on 2026-04-24.", "evidence": [{"doc_id": "doc_000278", "sent_id": 8}], "id": "train_001091", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from chain verification to structured prompting on 2026-06-07.", "evidence": [{"doc_id": "doc_000461", "sent_id": 4}], "id": "train_001092", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Quartz-3B achieved a higher latency efficiency score than Marble-3B.", "evidence": [{"doc_id": "doc_000207", "sent_id": 2}, {"doc_id": "doc_000027", "sent_id": 7}], "id": "train_001093", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Quartz-7B achieved 0.663 evidence F1 on OrionBench for Project Sonata on 2026-04-13.", "evidence": [{"doc_id": "doc_000264", "sent_id": 2}], "id": "train_001094", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Helix-3B achieved 0.613 macro F1 on OrionBench-2 for Project Aster on 2026-05-19.", "evidence": [{"doc_id": "doc_000086", "sent_id": 7}], "id": "train_001095", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The review of Quiet Retriever Study 2 reported on 2026-04-30 that it used evidence pooling and did not use a reward model.", "evidence": [], "id": "train_001096", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Hazel-14 allocated 3 GPUs to Project Nereid on 2026-04-04.", "evidence": [{"doc_id": "doc_000253", "sent_id": 6}], "id": "train_001097", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor changed its method from evidence pooling to BM25 retrieval on 2026-04-01.", "evidence": [{"doc_id": "doc_000253", "sent_id": 5}], "id": "train_001098", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-04-20 recorded macro F1 for Cedar-7B on NereidNotes-3 using Node Laurel-11 on 2026-04-21.", "evidence": [{"doc_id": "doc_000004", "sent_id": 2}, {"doc_id": "doc_000455", "sent_id": 7}], "id": "train_001099", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared macro F1 runs, Kestrel-7B achieved a higher macro F1 than Mira-7B.", "evidence": [{"doc_id": "doc_000248", "sent_id": 6}, {"doc_id": "doc_000062", "sent_id": 6}], "id": "train_001100", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata's run with Mira-3B failed on Node Sycamore-13 because of an unstable-validation-loss error on 2026-06-18.", "evidence": [{"doc_id": "doc_000049", "sent_id": 3}], "id": "train_001101", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Poplar-12 allocated 3 GPUs to Project Aster on 2026-05-17.", "evidence": [], "id": "train_001102", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Marble-3B achieved 0.586 macro F1 on NereidNotes for Project Sonata on 2026-06-14.", "evidence": [], "id": "train_001103", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian had a failed run with Nimbus-3B on Node Birch-04 because of a checkpoint-mismatch error on 2026-04-23.", "evidence": [{"doc_id": "doc_000423", "sent_id": 3}], "id": "train_001104", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata's run with Marble-7B failed on Node Maple-01 because of an unstable-validation-loss error on 2026-05-28.", "evidence": [{"doc_id": "doc_000413", "sent_id": 2}], "id": "train_001105", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Pine-07 allocated 2 GPUs to Project Saffron on 2026-06-17.", "evidence": [], "id": "train_001106", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Mira-7B for calibration on 2026-06-26.", "evidence": [{"doc_id": "doc_000031", "sent_id": 8}], "id": "train_001107", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vera Kim was assigned as the lead for Project Nereid on 2026-06-01.", "evidence": [{"doc_id": "doc_000461", "sent_id": 2}], "id": "train_001108", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata had a failed run with Marble-3B on Node Maple-01 because of an out-of-memory error on 2026-04-02.", "evidence": [{"doc_id": "doc_000243", "sent_id": 3}], "id": "train_001109", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from hard-negative mining to metric smoothing on 2026-06-24.", "evidence": [{"doc_id": "doc_000125", "sent_id": 5}], "id": "train_001110", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Lumen-7B for calibration on 2026-06-01.", "evidence": [], "id": "train_001111", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-05-05 selected Vela-7B for calibration on 2026-04-09.", "evidence": [{"doc_id": "doc_000346", "sent_id": 7}, {"doc_id": "doc_000019", "sent_id": 3}], "id": "train_001112", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Vela-3B for reranking on 2026-04-10.", "evidence": [{"doc_id": "doc_000123", "sent_id": 8}], "id": "train_001113", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nova-7B achieved 0.720 macro F1 on LabQA for Project Nereid on 2026-06-20.", "evidence": [{"doc_id": "doc_000259", "sent_id": 6}], "id": "train_001114", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid changed its method from QLoRA adaptation to structured prompting on 2026-05-20.", "evidence": [{"doc_id": "doc_000321", "sent_id": 5}], "id": "train_001115", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Saffron had a failed run with Marble-7B on Node Sycamore-13 because of a checkpoint-mismatch error on 2026-04-16.", "evidence": [{"doc_id": "doc_000404", "sent_id": 3}], "id": "train_001116", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Theo Grant was assigned as the data steward for Project Saffron on 2026-04-13.", "evidence": [{"doc_id": "doc_000480", "sent_id": 2}], "id": "train_001117", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Aster-7B for evidence retrieval on 2026-04-10.", "evidence": [{"doc_id": "doc_000119", "sent_id": 10}], "id": "train_001118", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-06-28 recorded macro F1 for Orchid-7B on LabQA-3 using Node Sycamore-13 on 2026-05-04.", "evidence": [{"doc_id": "doc_000442", "sent_id": 4}, {"doc_id": "doc_000127", "sent_id": 2}], "id": "train_001119", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "River-7B achieved 0.895 evidence F1 on RiverBench for Project Aster on 2026-05-25.", "evidence": [{"doc_id": "doc_000493", "sent_id": 2}], "id": "train_001120", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Rowan-09 allocated 2 GPUs to Project Anchor on 2026-06-04.", "evidence": [], "id": "train_001121", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone D1 deadline from 2026-06-04 to 2026-06-14 on 2026-05-13.", "evidence": [{"doc_id": "doc_000401", "sent_id": 5}], "id": "train_001122", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Lumen-8B for calibration on 2026-06-12.", "evidence": [{"doc_id": "doc_000452", "sent_id": 8}], "id": "train_001123", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Pale Compass Study 2 reported on 2026-04-07 that it used temporal filtering and did not use a reward model.", "evidence": [], "id": "train_001124", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Cedar-3B for error analysis on 2026-06-26.", "evidence": [{"doc_id": "doc_000427", "sent_id": 8}], "id": "train_001125", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Arun Bauer was assigned as the evaluation owner on 2026-06-29 changed its method from late interaction to cross-encoder reranking on 2026-04-12.", "evidence": [{"doc_id": "doc_000209", "sent_id": 2}, {"doc_id": "doc_000115", "sent_id": 4}], "id": "train_001126", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor's run with Helix-8B failed because of a missing-index error on 2026-04-03 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000488", "sent_id": 7}], "id": "train_001127", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Sycamore-13 allocated 2 GPUs to Project Saffron on 2026-04-19.", "evidence": [], "id": "train_001128", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Marble-3B achieved a higher latency efficiency score than Mira-7B.", "evidence": [{"doc_id": "doc_000100", "sent_id": 7}, {"doc_id": "doc_000077", "sent_id": 3}], "id": "train_001129", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Dr. Jonas Rios was assigned as the lead on 2026-05-25 selected Nova-7B for calibration on 2026-06-07.", "evidence": [{"doc_id": "doc_000384", "sent_id": 2}, {"doc_id": "doc_000007", "sent_id": 4}], "id": "train_001130", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Felix Brooks was assigned as the retrieval owner on 2026-06-01 changed its method from contrastive tuning to threshold search on 2026-06-14.", "evidence": [{"doc_id": "doc_000435", "sent_id": 2}, {"doc_id": "doc_000062", "sent_id": 4}], "id": "train_001131", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Lena Costa was assigned as the data steward for Project Saffron on 2026-05-09.", "evidence": [], "id": "train_001132", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Quartz-3B for claim classification on 2026-05-03.", "evidence": [{"doc_id": "doc_000173", "sent_id": 4}], "id": "train_001133", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Stable Chains Study 1 reported on 2026-04-20 that it used chain verification and used a reward model.", "evidence": [{"doc_id": "doc_000095", "sent_id": 2}], "id": "train_001134", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vera Kim was assigned as the lead for Project Nereid on 2026-06-10.", "evidence": [], "id": "train_001135", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Aspen-01 allocated 2 GPUs to Project Meridian on 2026-05-08.", "evidence": [], "id": "train_001136", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Nimbus-3B achieved a higher evidence F1 than Finch-7B.", "evidence": [{"doc_id": "doc_000232", "sent_id": 2}, {"doc_id": "doc_000130", "sent_id": 7}], "id": "train_001137", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Dr. Arun Bauer was assigned as the evaluation owner on 2026-05-11 selected Orchid-3B for evidence retrieval on 2026-04-30.", "evidence": [{"doc_id": "doc_000138", "sent_id": 2}, {"doc_id": "doc_000015", "sent_id": 3}], "id": "train_001138", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Priya Vale was assigned as the evaluation owner on 2026-04-20 had a failed run with Marble-7B on Node Maple-01 because of a missing-index error on 2026-04-02.", "evidence": [{"doc_id": "doc_000050", "sent_id": 2}, {"doc_id": "doc_000242", "sent_id": 3}], "id": "train_001139", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Anika Sato was assigned as the lead on 2026-06-16 recorded macro F1 for Lumen-3B on TraceEval using Node Maple-01 on 2026-04-25.", "evidence": [{"doc_id": "doc_000338", "sent_id": 7}, {"doc_id": "doc_000091", "sent_id": 6}], "id": "train_001140", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Atlas-3B achieved a higher evidence F1 than Vela-8B.", "evidence": [{"doc_id": "doc_000412", "sent_id": 6}, {"doc_id": "doc_000004", "sent_id": 7}], "id": "train_001141", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Nested Verifier Study 3 reported on 2026-04-18 that it used QLoRA adaptation and did not use a reward model.", "evidence": [], "id": "train_001142", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Evan Moss was assigned as the lead for Project Nereid on 2026-06-16.", "evidence": [{"doc_id": "doc_000481", "sent_id": 7}], "id": "train_001143", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Arun Bauer was assigned as the evaluation owner on 2026-05-11 selected Atlas-3B for evidence retrieval on 2026-06-24.", "evidence": [{"doc_id": "doc_000138", "sent_id": 2}, {"doc_id": "doc_000304", "sent_id": 5}], "id": "train_001144", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mira Sato was assigned as the data steward on 2026-06-30 moved the Milestone J1 deadline from 2026-04-25 to 2026-05-01 on 2026-04-15.", "evidence": [{"doc_id": "doc_000465", "sent_id": 7}, {"doc_id": "doc_000061", "sent_id": 5}], "id": "train_001145", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Cedar-8B achieved 0.842 latency efficiency score on VestaLogs-2 for Project Meridian on 2026-06-15.", "evidence": [{"doc_id": "doc_000049", "sent_id": 2}], "id": "train_001146", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Atlas-7B for evidence retrieval on 2026-04-26.", "evidence": [{"doc_id": "doc_000423", "sent_id": 4}], "id": "train_001147", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-05-26 received 3 GPUs from Node Birch-04 on 2026-06-13.", "evidence": [{"doc_id": "doc_000241", "sent_id": 7}, {"doc_id": "doc_000007", "sent_id": 6}], "id": "train_001148", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Priya Moss was assigned as the data steward on 2026-05-11 changed its method from calibrated voting to evidence pooling on 2026-06-11.", "evidence": [{"doc_id": "doc_000090", "sent_id": 2}, {"doc_id": "doc_000494", "sent_id": 3}], "id": "train_001149", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Helix-3B achieved 0.583 macro F1 on OrionBench-2 for Project Aster on 2026-05-25.", "evidence": [{"doc_id": "doc_000128", "sent_id": 2}], "id": "train_001150", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "In the compared evidence F1 runs, Kestrel-8B achieved a higher evidence F1 than Nimbus-8B.", "evidence": [{"doc_id": "doc_000363", "sent_id": 7}, {"doc_id": "doc_000303", "sent_id": 2}], "id": "train_001151", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared macro F1 runs, Cedar-7B achieved a higher macro F1 than Kestrel-7B.", "evidence": [{"doc_id": "doc_000455", "sent_id": 7}, {"doc_id": "doc_000089", "sent_id": 1}], "id": "train_001152", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian selected Atlas-7B for claim classification on 2026-04-19.", "evidence": [{"doc_id": "doc_000479", "sent_id": 4}], "id": "train_001153", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone B1 deadline from 2026-05-12 to 2026-05-18 on 2026-04-15.", "evidence": [], "id": "train_001154", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Camila Quinn was assigned as the evaluation owner on 2026-04-27 recorded accuracy for Atlas-7B on SignalSet-3 using Node Hazel-14 on 2026-05-04.", "evidence": [{"doc_id": "doc_000151", "sent_id": 2}, {"doc_id": "doc_000208", "sent_id": 2}], "id": "train_001155", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Darian Grant was assigned as the lead for Project Nereid on 2026-06-29.", "evidence": [{"doc_id": "doc_000374", "sent_id": 2}], "id": "train_001156", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from BM25 retrieval to evidence pooling on 2026-05-17.", "evidence": [{"doc_id": "doc_000326", "sent_id": 5}], "id": "train_001157", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster changed its method from alias expansion to temporal filtering on 2026-06-10.", "evidence": [{"doc_id": "doc_000034", "sent_id": 5}], "id": "train_001158", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Lattice Memory Study 2 reported on 2026-05-13 that it used QLoRA adaptation and did not use a reward model.", "evidence": [{"doc_id": "doc_000291", "sent_id": 5}], "id": "train_001159", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Marble-3B achieved 0.586 macro F1 on NereidNotes for Project Sonata on 2026-06-13.", "evidence": [{"doc_id": "doc_000461", "sent_id": 6}], "id": "train_001160", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone H1 deadline from 2026-06-20 to 2026-06-24 on 2026-05-27.", "evidence": [{"doc_id": "doc_000290", "sent_id": 5}], "id": "train_001161", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid changed its method from confidence calibration to chain verification on 2026-05-24.", "evidence": [{"doc_id": "doc_000323", "sent_id": 4}], "id": "train_001162", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mira Sato was assigned as the data steward for Project Meridian on 2026-06-01.", "evidence": [{"doc_id": "doc_000169", "sent_id": 2}], "id": "train_001163", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Vera Torres was assigned as the retrieval owner for Project Nereid on 2026-06-03.", "evidence": [], "id": "train_001164", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Quartz-7B for evidence retrieval on 2026-06-21.", "evidence": [{"doc_id": "doc_000488", "sent_id": 3}], "id": "train_001165", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Felix Lind was assigned as the lead for Project Aster on 2026-04-29.", "evidence": [], "id": "train_001166", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata had a failed run with Nova-3B on Node Sycamore-13 because of an out-of-memory error on 2026-06-04.", "evidence": [{"doc_id": "doc_000334", "sent_id": 3}], "id": "train_001167", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Dr. Arun Bauer was assigned as the evaluation owner on 2026-06-29 selected Helix-7B for claim classification on 2026-04-26.", "evidence": [{"doc_id": "doc_000209", "sent_id": 2}, {"doc_id": "doc_000497", "sent_id": 4}], "id": "train_001168", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mara Lane was assigned as the retrieval owner on 2026-06-08 changed its method from data mixing to reward reranking on 2026-04-15.", "evidence": [{"doc_id": "doc_000204", "sent_id": 2}, {"doc_id": "doc_000189", "sent_id": 5}], "id": "train_001169", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Nova-8B achieved 0.821 evidence F1 on LabQA for Project Nereid on 2026-06-01.", "evidence": [{"doc_id": "doc_000072", "sent_id": 2}], "id": "train_001170", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron changed its method from hard-negative mining to LoRA adaptation on 2026-05-04.", "evidence": [], "id": "train_001171", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira-7B achieved 0.624 latency efficiency score on MemoTrace-2 for Project Saffron on 2026-04-13.", "evidence": [{"doc_id": "doc_000479", "sent_id": 2}], "id": "train_001172", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-01 changed its method from temporal filtering to alias expansion on 2026-05-24.", "evidence": [{"doc_id": "doc_000093", "sent_id": 2}, {"doc_id": "doc_000476", "sent_id": 5}], "id": "train_001173", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Vela-3B achieved 0.681 evidence F1 on LabQA-2 for Project Sonata on 2026-06-16.", "evidence": [{"doc_id": "doc_000356", "sent_id": 7}], "id": "train_001174", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Helix-7B achieved 0.898 accuracy on CedarQA-3 for Project Anchor on 2026-06-22.", "evidence": [{"doc_id": "doc_000295", "sent_id": 2}], "id": "train_001175", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Quartz-7B achieved 0.667 latency efficiency score on CedarQA-2 for Project Saffron on 2026-06-03.", "evidence": [], "id": "train_001176", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Marble-3B for error analysis on 2026-05-02.", "evidence": [], "id": "train_001177", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Atlas-3B achieved 0.845 macro F1 on RiverBench-2 for Project Nereid on 2026-06-30.", "evidence": [{"doc_id": "doc_000125", "sent_id": 7}], "id": "train_001178", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Nova-8B achieved a higher accuracy than Cedar-8B.", "evidence": [{"doc_id": "doc_000467", "sent_id": 7}, {"doc_id": "doc_000472", "sent_id": 2}], "id": "train_001179", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Petra Gray was assigned as the evaluation owner for Project Saffron on 2026-05-27.", "evidence": [], "id": "train_001180", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mina Torres was assigned as the lead for Project Aster on 2026-04-21.", "evidence": [{"doc_id": "doc_000190", "sent_id": 7}], "id": "train_001181", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Orchid-8B for reranking on 2026-04-16.", "evidence": [{"doc_id": "doc_000234", "sent_id": 3}], "id": "train_001182", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared accuracy runs, Lumen-3B achieved a higher accuracy than River-3B.", "evidence": [{"doc_id": "doc_000391", "sent_id": 7}, {"doc_id": "doc_000230", "sent_id": 2}], "id": "train_001183", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Mina Shah was assigned as the lead on 2026-04-21 selected Helix-7B for claim classification on 2026-06-03.", "evidence": [{"doc_id": "doc_000190", "sent_id": 7}, {"doc_id": "doc_000447", "sent_id": 4}], "id": "train_001184", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Marble-8B for error analysis on 2026-05-07.", "evidence": [{"doc_id": "doc_000401", "sent_id": 3}], "id": "train_001185", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-04-28 received 6 GPUs from Node Laurel-11 on 2026-04-22.", "evidence": [{"doc_id": "doc_000234", "sent_id": 7}, {"doc_id": "doc_000314", "sent_id": 5}], "id": "train_001186", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-06-23 changed its method from hard-negative mining to metric smoothing on 2026-05-16.", "evidence": [{"doc_id": "doc_000204", "sent_id": 7}, {"doc_id": "doc_000202", "sent_id": 5}], "id": "train_001187", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from rank fusion to teacher distillation on 2026-04-12.", "evidence": [{"doc_id": "doc_000490", "sent_id": 4}], "id": "train_001188", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid changed its method from alias expansion to LoRA adaptation on 2026-05-13.", "evidence": [{"doc_id": "doc_000086", "sent_id": 5}], "id": "train_001189", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected River-7B for evidence retrieval on 2026-05-08.", "evidence": [{"doc_id": "doc_000053", "sent_id": 12}], "id": "train_001190", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from hybrid retrieval to QLoRA adaptation on 2026-04-29.", "evidence": [{"doc_id": "doc_000423", "sent_id": 5}], "id": "train_001191", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-06-29 had a failed run with Atlas-8B on Node Hazel-14 because of an unstable-validation-loss error on 2026-05-15.", "evidence": [{"doc_id": "doc_000047", "sent_id": 2}, {"doc_id": "doc_000252", "sent_id": 8}], "id": "train_001192", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Petra Gray was assigned as the evaluation owner for Project Saffron on 2026-06-10.", "evidence": [], "id": "train_001193", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Cedar-7B achieved 0.756 macro F1 on NereidNotes-3 for Project Nereid on 2026-04-28.", "evidence": [], "id": "train_001194", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared macro F1 runs, Nova-3B achieved a higher macro F1 than Atlas-3B.", "evidence": [{"doc_id": "doc_000474", "sent_id": 2}, {"doc_id": "doc_000125", "sent_id": 7}], "id": "train_001195", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Nadia Chen was assigned as the evaluation owner on 2026-05-05 moved the Milestone B2 deadline from 2026-06-24 to 2026-06-30 on 2026-06-10.", "evidence": [{"doc_id": "doc_000260", "sent_id": 7}, {"doc_id": "doc_000461", "sent_id": 5}], "id": "train_001196", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Leo Hale was assigned as the data steward on 2026-04-05 recorded accuracy for Finch-8B on VestaLogs using Node Juniper-06 on 2026-05-30.", "evidence": [{"doc_id": "doc_000012", "sent_id": 4}, {"doc_id": "doc_000403", "sent_id": 6}], "id": "train_001197", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Vector Lantern Study 2 reported on 2026-06-22 that it used hybrid retrieval and did not use a reward model.", "evidence": [{"doc_id": "doc_000188", "sent_id": 2}], "id": "train_001198", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Marble-7B for error analysis on 2026-04-23.", "evidence": [{"doc_id": "doc_000298", "sent_id": 3}], "id": "train_001199", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Rohan Singh was assigned as the retrieval owner for Project Sonata on 2026-04-08.", "evidence": [], "id": "train_001200", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Amber Ranking Study 1 reported on 2026-04-06 that it used confidence calibration and used a reward model.", "evidence": [{"doc_id": "doc_000464", "sent_id": 2}], "id": "train_001201", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Sofia Frost was assigned as the lead for Project Sonata on 2026-06-29.", "evidence": [{"doc_id": "doc_000112", "sent_id": 2}], "id": "train_001202", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian had a failed run with Quartz-8B on Node Laurel-11 because of a missing-index error on 2026-05-28.", "evidence": [{"doc_id": "doc_000493", "sent_id": 3}], "id": "train_001203", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, River-8B achieved a higher evidence F1 than Mira-3B.", "evidence": [{"doc_id": "doc_000031", "sent_id": 6}, {"doc_id": "doc_000419", "sent_id": 7}], "id": "train_001204", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid moved the Milestone B2 deadline to 2026-05-13 on 2026-04-29.", "evidence": [{"doc_id": "doc_000390", "sent_id": 5}], "id": "train_001205", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Mira Nolan was assigned as the evaluation owner for Project Meridian on 2026-06-24.", "evidence": [], "id": "train_001206", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-05-25 had a failed run with Vela-3B on Node Fir-10 because of a missing-index error on 2026-04-30.", "evidence": [{"doc_id": "doc_000113", "sent_id": 1}, {"doc_id": "doc_000303", "sent_id": 3}], "id": "train_001207", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-05-11 recorded accuracy for Aster-7B on SignalSet using Node Willow-05 on 2026-06-16.", "evidence": [{"doc_id": "doc_000241", "sent_id": 2}, {"doc_id": "doc_000007", "sent_id": 7}], "id": "train_001208", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Finch-3B achieved 0.637 macro F1 on NereidNotes-2 for Project Aster on 2026-06-30.", "evidence": [{"doc_id": "doc_000049", "sent_id": 7}], "id": "train_001209", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Orchid-3B achieved a higher accuracy than Helix-7B.", "evidence": [{"doc_id": "doc_000314", "sent_id": 7}, {"doc_id": "doc_000279", "sent_id": 3}], "id": "train_001210", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Iris Lane was assigned as the evaluation owner for Project Anchor on 2026-05-04.", "evidence": [{"doc_id": "doc_000316", "sent_id": 2}], "id": "train_001211", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Sonata selected Mira-7B for calibration on 2026-04-02.", "evidence": [{"doc_id": "doc_000292", "sent_id": 3}], "id": "train_001212", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Atlas-7B achieved 0.785 evidence F1 on RiverBench-2 for Project Nereid on 2026-06-29.", "evidence": [{"doc_id": "doc_000242", "sent_id": 2}], "id": "train_001213", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid had a failed run with Cedar-7B on Node Hazel-14 because of an unstable-validation-loss error on 2026-04-03.", "evidence": [{"doc_id": "doc_000089", "sent_id": 7}], "id": "train_001214", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from late interaction to BM25 retrieval on 2026-05-10.", "evidence": [{"doc_id": "doc_000401", "sent_id": 4}], "id": "train_001215", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The review of Quiet Retriever Study 4 reported on 2026-05-02 that it used sentence pruning and did not use a reward model.", "evidence": [], "id": "train_001216", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Iris Stone was assigned as the data steward for Project Anchor on 2026-06-10.", "evidence": [], "id": "train_001217", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, Vela-8B achieved a higher accuracy than Finch-8B.", "evidence": [{"doc_id": "doc_000116", "sent_id": 6}, {"doc_id": "doc_000403", "sent_id": 6}], "id": "train_001218", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Helix-3B achieved 0.887 accuracy on CedarQA-3 for Project Anchor on 2026-05-02.", "evidence": [{"doc_id": "doc_000238", "sent_id": 7}], "id": "train_001219", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata changed its method from query rewriting to sentence pruning on 2026-05-03.", "evidence": [{"doc_id": "doc_000400", "sent_id": 4}], "id": "train_001220", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared latency efficiency score runs, Orchid-3B achieved a higher latency efficiency score than River-3B.", "evidence": [{"doc_id": "doc_000079", "sent_id": 2}, {"doc_id": "doc_000120", "sent_id": 7}], "id": "train_001221", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Anika Sato was assigned as the lead on 2026-06-16 had a run with River-7B on Node Rowan-09 that failed because of an out-of-memory error on 2026-04-16.", "evidence": [{"doc_id": "doc_000338", "sent_id": 7}, {"doc_id": "doc_000339", "sent_id": 3}], "id": "train_001222", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Owen Marin was assigned as the data steward for Project Saffron on 2026-05-30.", "evidence": [], "id": "train_001223", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Fir-10 allocated 6 GPUs to Project Saffron on 2026-05-16.", "evidence": [{"doc_id": "doc_000086", "sent_id": 6}], "id": "train_001224", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Aster-8B achieved a higher evidence F1 than Quartz-7B.", "evidence": [{"doc_id": "doc_000144", "sent_id": 3}, {"doc_id": "doc_000141", "sent_id": 6}], "id": "train_001225", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster selected River-7B for evidence retrieval on 2026-06-20.", "evidence": [], "id": "train_001226", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Mina Torres was assigned as the lead for Project Nereid on 2026-06-08.", "evidence": [{"doc_id": "doc_000062", "sent_id": 2}], "id": "train_001227", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Ravi Nadir was assigned as the data steward on 2026-06-22 recorded accuracy for Nova-8B on LumenFacts-2 using Node Aspen-01 on 2026-04-06.", "evidence": [{"doc_id": "doc_000103", "sent_id": 2}, {"doc_id": "doc_000328", "sent_id": 1}], "id": "train_001228", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Lumen-7B for calibration on 2026-04-30.", "evidence": [{"doc_id": "doc_000165", "sent_id": 3}], "id": "train_001229", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-04-28 received 4 GPUs from Node Sycamore-13 on 2026-05-14.", "evidence": [{"doc_id": "doc_000402", "sent_id": 7}, {"doc_id": "doc_000469", "sent_id": 2}], "id": "train_001230", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid moved the Milestone J1 deadline to 2026-06-05 on 2026-05-06.", "evidence": [{"doc_id": "doc_000372", "sent_id": 5}], "id": "train_001231", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Rohan Singh was assigned as the retrieval owner on 2026-04-13 received 3 GPUs from Node Sycamore-13 on 2026-05-23.", "evidence": [{"doc_id": "doc_000234", "sent_id": 2}, {"doc_id": "doc_000035", "sent_id": 6}], "id": "train_001232", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Sycamore-13 allocated 3 GPUs to Project Sonata on 2026-05-03.", "evidence": [], "id": "train_001233", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-05-05 recorded macro F1 for Kestrel-8B on OrionBench-3 using Node Hazel-14 on 2026-04-11.", "evidence": [{"doc_id": "doc_000050", "sent_id": 7}, {"doc_id": "doc_000255", "sent_id": 6}], "id": "train_001234", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Nadia Singh was assigned as the data steward for Project Meridian on 2026-04-13.", "evidence": [{"doc_id": "doc_000064", "sent_id": 2}], "id": "train_001235", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Helix-8B achieved 0.887 accuracy on CedarQA-3 for Project Anchor on 2026-04-07.", "evidence": [], "id": "train_001236", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Atlas-7B for reranking on 2026-06-28.", "evidence": [{"doc_id": "doc_000253", "sent_id": 4}], "id": "train_001237", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Nova-3B achieved 0.675 latency efficiency score on LumenFacts-2 for Project Meridian on 2026-06-30.", "evidence": [], "id": "train_001238", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Atlas-7B for reranking on 2026-06-07.", "evidence": [{"doc_id": "doc_000334", "sent_id": 4}], "id": "train_001239", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Dr. Jonas Rios was assigned as the lead on 2026-05-25 selected Helix-3B for evidence retrieval on 2026-06-24.", "evidence": [{"doc_id": "doc_000384", "sent_id": 2}, {"doc_id": "doc_000377", "sent_id": 5}], "id": "train_001240", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Lena Sol was assigned as the evaluation owner for Project Saffron on 2026-06-28.", "evidence": [], "id": "train_001241", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid selected Nova-7B for reranking on 2026-06-11.", "evidence": [{"doc_id": "doc_000003", "sent_id": 3}], "id": "train_001242", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-06-09 received 6 GPUs from Node Cedar-02 on 2026-05-23.", "evidence": [{"doc_id": "doc_000388", "sent_id": 7}, {"doc_id": "doc_000090", "sent_id": 6}], "id": "train_001243", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared macro F1 runs, Marble-8B achieved a higher macro F1 than Atlas-3B.", "evidence": [{"doc_id": "doc_000033", "sent_id": 2}, {"doc_id": "doc_000125", "sent_id": 7}], "id": "train_001244", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Marble-8B achieved 0.732 accuracy on VestaLogs-3 for Project Saffron on 2026-04-07.", "evidence": [{"doc_id": "doc_000253", "sent_id": 7}], "id": "train_001245", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid had a failed run with Aster-8B on Node Laurel-11 because of an out-of-memory error on 2026-05-01.", "evidence": [{"doc_id": "doc_000269", "sent_id": 8}], "id": "train_001246", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor moved the Milestone H1 deadline from 2026-06-06 to 2026-06-14 on 2026-05-13.", "evidence": [{"doc_id": "doc_000426", "sent_id": 5}], "id": "train_001247", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Pale Compass Study 4 reported on 2026-04-09 that it used data mixing and did not use a reward model.", "evidence": [], "id": "train_001248", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Anika Sato was assigned as the lead for Project Aster on 2026-06-01.", "evidence": [{"doc_id": "doc_000463", "sent_id": 2}], "id": "train_001249", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected River-3B for evidence retrieval on 2026-06-20.", "evidence": [], "id": "train_001250", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-04-13 had a run with Lumen-7B on Node Spruce-03 that failed because of a missing-index error on 2026-04-02.", "evidence": [{"doc_id": "doc_000159", "sent_id": 1}, {"doc_id": "doc_000221", "sent_id": 2}], "id": "train_001251", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Orchid-7B for calibration on 2026-04-12.", "evidence": [{"doc_id": "doc_000026", "sent_id": 4}], "id": "train_001252", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vela-8B achieved 0.640 macro F1 on LabQA-2 for Project Sonata on 2026-06-24.", "evidence": [], "id": "train_001253", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Iris Stone was assigned as the data steward for Project Anchor on 2026-05-27.", "evidence": [], "id": "train_001254", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Stable Chains Study 1 reported on 2026-05-12 that it used document chunking and did not use a reward model.", "evidence": [], "id": "train_001255", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-04-07 recorded evidence F1 for Finch-3B on NereidNotes-2 using Node Juniper-06 on 2026-06-15.", "evidence": [{"doc_id": "doc_000103", "sent_id": 9}, {"doc_id": "doc_000027", "sent_id": 2}], "id": "train_001256", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Quartz-3B achieved 0.697 accuracy on CedarQA-2 for Project Saffron on 2026-06-10.", "evidence": [], "id": "train_001257", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata changed its method from reward reranking to threshold search on 2026-04-22.", "evidence": [{"doc_id": "doc_000479", "sent_id": 5}], "id": "train_001258", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Talia Marin was assigned as the lead on 2026-06-08 received 1 GPU from Node Juniper-06 on 2026-05-01.", "evidence": [{"doc_id": "doc_000259", "sent_id": 2}, {"doc_id": "doc_000222", "sent_id": 8}], "id": "train_001259", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid moved the Milestone P1 deadline to 2026-05-10 on 2026-04-22.", "evidence": [{"doc_id": "doc_000064", "sent_id": 5}], "id": "train_001260", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared latency efficiency score runs, Mira-8B achieved a higher latency efficiency score than Vela-8B.", "evidence": [{"doc_id": "doc_000349", "sent_id": 7}, {"doc_id": "doc_000429", "sent_id": 3}], "id": "train_001261", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Mina Shah was assigned as the lead for Project Aster on 2026-05-05.", "evidence": [{"doc_id": "doc_000390", "sent_id": 7}], "id": "train_001262", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Marble-3B achieved 0.747 macro F1 on NereidNotes for Project Sonata on 2026-05-26.", "evidence": [{"doc_id": "doc_000344", "sent_id": 6}], "id": "train_001263", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid selected Atlas-7B for error analysis on 2026-04-19.", "evidence": [{"doc_id": "doc_000264", "sent_id": 4}], "id": "train_001264", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Amber Ranking Study 3 reported on 2026-06-05 that it used teacher distillation and used a reward model.", "evidence": [{"doc_id": "doc_000317", "sent_id": 8}], "id": "train_001265", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Aster-3B achieved a higher latency efficiency score than Nimbus-3B.", "evidence": [{"doc_id": "doc_000292", "sent_id": 6}, {"doc_id": "doc_000425", "sent_id": 6}], "id": "train_001266", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster changed its method from late interaction to cross-encoder reranking on 2026-06-24.", "evidence": [{"doc_id": "doc_000488", "sent_id": 4}], "id": "train_001267", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster had a failed run with Vela-7B on Node Cedar-02 because of an out-of-memory error on 2026-04-30.", "evidence": [{"doc_id": "doc_000173", "sent_id": 3}], "id": "train_001268", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Elm-08 allocated 3 GPUs to Project Nereid on 2026-05-16.", "evidence": [{"doc_id": "doc_000127", "sent_id": 6}], "id": "train_001269", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-04-20 selected Lumen-8B for calibration on 2026-05-12.", "evidence": [{"doc_id": "doc_000133", "sent_id": 2}, {"doc_id": "doc_000358", "sent_id": 7}], "id": "train_001270", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata changed its method from query rewriting to teacher distillation on 2026-05-27.", "evidence": [{"doc_id": "doc_000363", "sent_id": 5}], "id": "train_001271", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Jonas Nolan was assigned as the retrieval owner for Project Sonata on 2026-06-01.", "evidence": [{"doc_id": "doc_000037", "sent_id": 2}], "id": "train_001272", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron selected Atlas-7B for claim classification on 2026-06-07.", "evidence": [{"doc_id": "doc_000034", "sent_id": 4}], "id": "train_001273", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Atlas-8B achieved 0.875 evidence F1 on RiverBench-2 for Project Nereid on 2026-04-07.", "evidence": [{"doc_id": "doc_000139", "sent_id": 7}], "id": "train_001274", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Amber Ranking Study 2 reported on 2026-04-14 that it used hybrid retrieval and did not use a reward model.", "evidence": [], "id": "train_001275", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone Z1 deadline from 2026-04-29 to 2026-05-03 on 2026-04-23.", "evidence": [], "id": "train_001276", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-06-02 received 2 GPUs from Node Elm-08 on 2026-06-27.", "evidence": [{"doc_id": "doc_000256", "sent_id": 7}, {"doc_id": "doc_000488", "sent_id": 5}], "id": "train_001277", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron moved the Milestone F1 deadline from 2026-04-09 to 2026-04-13 on 2026-04-01.", "evidence": [{"doc_id": "doc_000142", "sent_id": 5}], "id": "train_001278", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-04-27 selected Aster-7B for evidence retrieval on 2026-06-09.", "evidence": [{"doc_id": "doc_000015", "sent_id": 2}, {"doc_id": "doc_000354", "sent_id": 7}], "id": "train_001279", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Owen Marin was assigned as the data steward for Project Saffron on 2026-06-15.", "evidence": [{"doc_id": "doc_000288", "sent_id": 2}], "id": "train_001280", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Priya Vale was assigned as the evaluation owner on 2026-04-20 had a failed run with Quartz-3B on Node Pine-07 because of a checkpoint-mismatch error on 2026-04-16.", "evidence": [{"doc_id": "doc_000050", "sent_id": 2}, {"doc_id": "doc_000314", "sent_id": 3}], "id": "train_001281", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster moved the Milestone B1 deadline to 2026-07-08 on 2026-06-13.", "evidence": [], "id": "train_001282", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Spruce-03 allocated 3 GPUs to Project Aster on 2026-05-02.", "evidence": [], "id": "train_001283", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected River-7B for evidence retrieval on 2026-04-09.", "evidence": [{"doc_id": "doc_000490", "sent_id": 3}], "id": "train_001284", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Owen Marin was assigned as the data steward on 2026-06-15 received 6 GPUs from Node Fir-10 on 2026-06-26.", "evidence": [{"doc_id": "doc_000288", "sent_id": 2}, {"doc_id": "doc_000345", "sent_id": 8}], "id": "train_001285", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster changed its method from LoRA adaptation to hard-negative mining on 2026-06-17.", "evidence": [{"doc_id": "doc_000312", "sent_id": 5}], "id": "train_001286", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Pine-07 allocated 3 GPUs to Project Sonata on 2026-04-04.", "evidence": [{"doc_id": "doc_000318", "sent_id": 6}], "id": "train_001287", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-04-28 received 6 GPUs from Node Aspen-01 on 2026-06-27.", "evidence": [{"doc_id": "doc_000234", "sent_id": 7}, {"doc_id": "doc_000240", "sent_id": 6}], "id": "train_001288", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Nova-8B for reranking on 2026-06-27.", "evidence": [], "id": "train_001289", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Orchid-3B for reranking on 2026-05-14.", "evidence": [{"doc_id": "doc_000078", "sent_id": 3}], "id": "train_001290", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from data mixing to contrastive tuning on 2026-05-31.", "evidence": [{"doc_id": "doc_000375", "sent_id": 4}], "id": "train_001291", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected River-8B for evidence retrieval on 2026-05-04.", "evidence": [], "id": "train_001292", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron moved the Milestone R1 deadline from 2026-05-02 to 2026-05-06 on 2026-04-09.", "evidence": [], "id": "train_001293", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared accuracy runs, Nova-8B achieved a higher accuracy than Quartz-7B.", "evidence": [{"doc_id": "doc_000467", "sent_id": 7}, {"doc_id": "doc_000363", "sent_id": 2}], "id": "train_001294", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid selected Atlas-3B for evidence retrieval on 2026-04-24.", "evidence": [{"doc_id": "doc_000114", "sent_id": 8}], "id": "train_001295", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid changed its method from structured prompting to QLoRA adaptation on 2026-06-14.", "evidence": [{"doc_id": "doc_000031", "sent_id": 4}], "id": "train_001296", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Elian Shah was assigned as the data steward for Project Meridian on 2026-06-24.", "evidence": [], "id": "train_001297", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Vela-7B achieved 0.625 latency efficiency score on LumenFacts-3 for Project Saffron on 2026-06-13.", "evidence": [{"doc_id": "doc_000169", "sent_id": 6}], "id": "train_001298", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Silver Notes Study 1 reported on 2026-06-17 that it used data mixing and used a reward model.", "evidence": [{"doc_id": "doc_000494", "sent_id": 5}], "id": "train_001299", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Dr. Jonas Rios was assigned as the lead for Project Sonata on 2026-06-03.", "evidence": [], "id": "train_001300", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Maple-01 allocated 6 GPUs to Project Saffron on 2026-04-04.", "evidence": [{"doc_id": "doc_000228", "sent_id": 6}], "id": "train_001301", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Cedar-7B achieved 0.827 macro F1 on NereidNotes-3 for Project Nereid on 2026-04-27.", "evidence": [{"doc_id": "doc_000215", "sent_id": 2}], "id": "train_001302", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Nimbus-7B achieved 0.899 evidence F1 on TraceEval-2 for Project Nereid on 2026-05-19.", "evidence": [{"doc_id": "doc_000134", "sent_id": 7}], "id": "train_001303", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Anika Sato was assigned as the lead on 2026-06-01 recorded evidence F1 for Helix-7B on OrionBench-2 using Node Poplar-12 on 2026-05-12.", "evidence": [{"doc_id": "doc_000463", "sent_id": 2}, {"doc_id": "doc_000252", "sent_id": 7}], "id": "train_001304", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Finch-7B achieved a higher latency efficiency score than Kestrel-3B.", "evidence": [{"doc_id": "doc_000262", "sent_id": 6}, {"doc_id": "doc_000249", "sent_id": 8}], "id": "train_001305", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Nova-7B achieved 0.806 latency efficiency score on LumenFacts-2 for Project Meridian on 2026-04-13.", "evidence": [{"doc_id": "doc_000398", "sent_id": 2}], "id": "train_001306", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-06-29 moved the Milestone P1 deadline from 2026-06-04 to 2026-06-12 on 2026-05-27.", "evidence": [{"doc_id": "doc_000047", "sent_id": 2}, {"doc_id": "doc_000491", "sent_id": 5}], "id": "train_001307", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-05-19 received 3 GPUs from Node Willow-05 on 2026-06-17.", "evidence": [{"doc_id": "doc_000316", "sent_id": 8}, {"doc_id": "doc_000345", "sent_id": 5}], "id": "train_001308", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Nimbus-3B achieved 0.839 evidence F1 on TraceEval-2 for Project Nereid on 2026-05-05.", "evidence": [{"doc_id": "doc_000499", "sent_id": 7}], "id": "train_001309", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Nimbus-8B achieved 0.794 latency efficiency score on MemoTrace for Project Meridian on 2026-06-22.", "evidence": [{"doc_id": "doc_000228", "sent_id": 2}], "id": "train_001310", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-06-23 changed its method from dense retrieval to rank fusion on 2026-05-03.", "evidence": [{"doc_id": "doc_000204", "sent_id": 7}, {"doc_id": "doc_000165", "sent_id": 4}], "id": "train_001311", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Aster-7B achieved 0.765 evidence F1 on RiverBench-3 for Project Sonata on 2026-05-12.", "evidence": [{"doc_id": "doc_000230", "sent_id": 7}], "id": "train_001312", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Nested Verifier Study 3 reported on 2026-06-23 that it used rank fusion and did not use a reward model.", "evidence": [], "id": "train_001313", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata moved the Milestone L1 deadline to 2026-07-05 on 2026-06-28.", "evidence": [], "id": "train_001314", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian's run with Atlas-7B failed because of a checkpoint-mismatch error on 2026-05-22 while using Node Fir-10.", "evidence": [{"doc_id": "doc_000086", "sent_id": 8}], "id": "train_001315", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone Z1 deadline from 2026-06-29 to 2026-07-09 on 2026-06-12.", "evidence": [], "id": "train_001316", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor selected Helix-7B for claim classification on 2026-04-27.", "evidence": [], "id": "train_001317", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Quiet Retriever Study 1 reported on 2026-05-05 that it used evidence pooling and did not use a reward model.", "evidence": [], "id": "train_001318", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron changed its method from hybrid retrieval to document chunking on 2026-05-11.", "evidence": [], "id": "train_001319", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata moved the Milestone R1 deadline to 2026-06-12 on 2026-05-27.", "evidence": [{"doc_id": "doc_000403", "sent_id": 5}], "id": "train_001320", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Arun Bauer was assigned as the evaluation owner for Project Anchor on 2026-05-05.", "evidence": [{"doc_id": "doc_000298", "sent_id": 7}], "id": "train_001321", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-06-16 received 2 GPUs from Node Poplar-12 on 2026-04-23.", "evidence": [{"doc_id": "doc_000461", "sent_id": 7}, {"doc_id": "doc_000175", "sent_id": 3}], "id": "train_001322", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-06-16 changed its method from structured prompting to QLoRA adaptation on 2026-05-24.", "evidence": [{"doc_id": "doc_000435", "sent_id": 7}, {"doc_id": "doc_000417", "sent_id": 4}], "id": "train_001323", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Talia Reed was assigned as the retrieval owner on 2026-05-26 changed its method from data mixing to reward reranking on 2026-05-27.", "evidence": [{"doc_id": "doc_000319", "sent_id": 7}, {"doc_id": "doc_000054", "sent_id": 5}], "id": "train_001324", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Aster-3B achieved 0.634 macro F1 on RiverBench-3 for Project Sonata on 2026-06-06.", "evidence": [{"doc_id": "doc_000388", "sent_id": 6}], "id": "train_001325", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian's run with Cedar-3B failed because of a checkpoint-mismatch error on 2026-05-21 while using Node Laurel-11.", "evidence": [{"doc_id": "doc_000080", "sent_id": 3}], "id": "train_001326", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor changed its method from QLoRA adaptation to hybrid retrieval on 2026-05-24.", "evidence": [{"doc_id": "doc_000043", "sent_id": 4}], "id": "train_001327", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Aster-7B achieved a higher macro F1 than Mira-7B.", "evidence": [{"doc_id": "doc_000276", "sent_id": 2}, {"doc_id": "doc_000054", "sent_id": 7}], "id": "train_001328", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster selected Helix-3B for claim classification on 2026-06-12.", "evidence": [{"doc_id": "doc_000375", "sent_id": 8}], "id": "train_001329", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from contrastive tuning to threshold search on 2026-06-08.", "evidence": [], "id": "train_001330", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared accuracy runs, Mira-3B achieved a higher accuracy than Cedar-3B.", "evidence": [{"doc_id": "doc_000499", "sent_id": 2}, {"doc_id": "doc_000033", "sent_id": 7}], "id": "train_001331", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Quiet Retriever Study 3 reported on 2026-05-28 that it used metric smoothing and did not use a reward model.", "evidence": [], "id": "train_001332", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian selected Atlas-3B for evidence retrieval on 2026-06-20.", "evidence": [], "id": "train_001333", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Nested Verifier Study 2 reported on 2026-04-23 that it used QLoRA adaptation and did not use a reward model.", "evidence": [], "id": "train_001334", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, Nimbus-7B achieved a higher accuracy than Atlas-8B.", "evidence": [{"doc_id": "doc_000046", "sent_id": 7}, {"doc_id": "doc_000006", "sent_id": 1}], "id": "train_001335", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Mina Torres was assigned as the evaluation owner for Project Anchor on 2026-06-23.", "evidence": [{"doc_id": "doc_000062", "sent_id": 7}], "id": "train_001336", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Marble-7B achieved a higher accuracy than Atlas-8B.", "evidence": [{"doc_id": "doc_000097", "sent_id": 2}, {"doc_id": "doc_000276", "sent_id": 7}], "id": "train_001337", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Rowan-09 allocated 4 GPUs to Project Anchor on 2026-05-28.", "evidence": [], "id": "train_001338", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron selected Vela-8B for reranking on 2026-06-12.", "evidence": [{"doc_id": "doc_000262", "sent_id": 8}], "id": "train_001339", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from late interaction to BM25 retrieval on 2026-04-19.", "evidence": [{"doc_id": "doc_000480", "sent_id": 4}], "id": "train_001340", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Kestrel-8B achieved 0.687 latency efficiency score on CedarQA for Project Meridian on 2026-05-16.", "evidence": [{"doc_id": "doc_000117", "sent_id": 6}], "id": "train_001341", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Cedar-7B achieved 0.812 latency efficiency score on VestaLogs-2 for Project Meridian on 2026-05-26.", "evidence": [{"doc_id": "doc_000492", "sent_id": 7}], "id": "train_001342", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Arun Kwan was assigned as the data steward for Project Anchor on 2026-05-02.", "evidence": [], "id": "train_001343", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Aster-3B achieved 0.630 accuracy on SignalSet for Project Saffron on 2026-06-08.", "evidence": [{"doc_id": "doc_000261", "sent_id": 2}], "id": "train_001344", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Aster-8B for evidence retrieval on 2026-06-22.", "evidence": [], "id": "train_001345", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster moved the Milestone H1 deadline to 2026-07-03 on 2026-06-17.", "evidence": [{"doc_id": "doc_000003", "sent_id": 5}], "id": "train_001346", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Priya Vale was assigned as the evaluation owner for Project Saffron on 2026-04-29.", "evidence": [], "id": "train_001347", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid changed its method from confidence calibration to calibrated voting on 2026-04-22.", "evidence": [{"doc_id": "doc_000398", "sent_id": 5}], "id": "train_001348", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone V1 deadline from 2026-05-01 to 2026-05-09 on 2026-04-15.", "evidence": [{"doc_id": "doc_000114", "sent_id": 5}], "id": "train_001349", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata had a failed run with Nova-7B on Node Willow-05 because of an out-of-memory error on 2026-05-01.", "evidence": [{"doc_id": "doc_000264", "sent_id": 8}], "id": "train_001350", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Atlas-7B for error analysis on 2026-05-24.", "evidence": [{"doc_id": "doc_000363", "sent_id": 4}], "id": "train_001351", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared latency efficiency score runs, Finch-3B achieved a higher latency efficiency score than Vela-3B.", "evidence": [{"doc_id": "doc_000078", "sent_id": 6}, {"doc_id": "doc_000043", "sent_id": 6}], "id": "train_001352", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-04-27 changed its method from evidence pooling to BM25 retrieval on 2026-06-07.", "evidence": [{"doc_id": "doc_000015", "sent_id": 2}, {"doc_id": "doc_000477", "sent_id": 4}], "id": "train_001353", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Nested Verifier Study 3 reported on 2026-06-27 that it used hard-negative mining and did not use a reward model.", "evidence": [], "id": "train_001354", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid's run with Nova-7B failed because of an out-of-memory error on 2026-05-15 while using Node Aspen-01.", "evidence": [{"doc_id": "doc_000303", "sent_id": 8}], "id": "train_001355", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "In the compared latency efficiency score runs, Atlas-7B achieved a higher latency efficiency score than Vela-3B.", "evidence": [{"doc_id": "doc_000452", "sent_id": 6}, {"doc_id": "doc_000043", "sent_id": 6}], "id": "train_001356", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Helix-8B achieved 0.943 macro F1 on OrionBench-2 for Project Aster on 2026-05-05.", "evidence": [{"doc_id": "doc_000416", "sent_id": 7}], "id": "train_001357", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-05-26 recorded macro F1 for Nova-3B on LabQA using Node Aspen-01 on 2026-05-25.", "evidence": [{"doc_id": "doc_000241", "sent_id": 7}, {"doc_id": "doc_000474", "sent_id": 2}], "id": "train_001358", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Pale Compass Study 1 reported on 2026-06-09 that it used contrastive tuning and did not use a reward model.", "evidence": [], "id": "train_001359", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from contrastive tuning to data mixing on 2026-05-06.", "evidence": [{"doc_id": "doc_000173", "sent_id": 5}], "id": "train_001360", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid's run with Nova-3B failed because of an unstable-validation-loss error on 2026-05-28 while using Node Elm-08.", "evidence": [{"doc_id": "doc_000486", "sent_id": 3}], "id": "train_001361", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from hybrid retrieval to document chunking on 2026-06-26.", "evidence": [], "id": "train_001362", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron selected Quartz-3B for claim classification on 2026-06-18.", "evidence": [{"doc_id": "doc_000419", "sent_id": 4}], "id": "train_001363", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Lena Costa was assigned as the data steward for Project Saffron on 2026-05-25.", "evidence": [{"doc_id": "doc_000262", "sent_id": 2}], "id": "train_001364", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Sofia Nadir was assigned as the retrieval owner for Project Sonata on 2026-06-08.", "evidence": [{"doc_id": "doc_000031", "sent_id": 2}], "id": "train_001365", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Cedar-3B for error analysis on 2026-05-07.", "evidence": [{"doc_id": "doc_000117", "sent_id": 3}], "id": "train_001366", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Helix-8B achieved 0.816 accuracy on CedarQA-3 for Project Anchor on 2026-04-06.", "evidence": [{"doc_id": "doc_000074", "sent_id": 1}], "id": "train_001367", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Atlas-8B achieved 0.714 macro F1 on RiverBench-2 for Project Nereid on 2026-06-17.", "evidence": [], "id": "train_001368", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Kestrel-8B achieved 0.732 macro F1 on OrionBench-3 for Project Nereid on 2026-04-11.", "evidence": [{"doc_id": "doc_000255", "sent_id": 6}], "id": "train_001369", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron had a failed run with Marble-8B on Node Maple-01 because of a checkpoint-mismatch error on 2026-06-20.", "evidence": [], "id": "train_001370", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "River-7B achieved 0.828 latency efficiency score on SignalSet-2 for Project Anchor on 2026-04-14.", "evidence": [{"doc_id": "doc_000242", "sent_id": 7}], "id": "train_001371", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor moved the Milestone H1 deadline from 2026-05-20 to 2026-05-30 on 2026-05-06.", "evidence": [{"doc_id": "doc_000400", "sent_id": 5}], "id": "train_001372", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor's run with Orchid-7B failed on Node Cedar-02 because of an out-of-memory error on 2026-06-05.", "evidence": [{"doc_id": "doc_000054", "sent_id": 8}], "id": "train_001373", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Orchid-8B for reranking on 2026-06-27.", "evidence": [], "id": "train_001374", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Vector Lantern Study 4 reported on 2026-05-22 that it used late interaction and used a reward model.", "evidence": [{"doc_id": "doc_000087", "sent_id": 7}], "id": "train_001375", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Aster-3B achieved a higher evidence F1 than Marble-3B.", "evidence": [{"doc_id": "doc_000077", "sent_id": 6}, {"doc_id": "doc_000343", "sent_id": 3}], "id": "train_001376", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared accuracy runs, Lumen-3B achieved a higher accuracy than Nova-3B.", "evidence": [{"doc_id": "doc_000391", "sent_id": 7}, {"doc_id": "doc_000227", "sent_id": 1}], "id": "train_001377", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-05-26 had a failed run with Vela-3B on Node Fir-10 because of a checkpoint-mismatch error on 2026-06-27.", "evidence": [{"doc_id": "doc_000286", "sent_id": 7}, {"doc_id": "doc_000475", "sent_id": 6}], "id": "train_001378", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Atlas-7B for claim classification on 2026-06-26.", "evidence": [{"doc_id": "doc_000062", "sent_id": 8}], "id": "train_001379", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Camila Quinn was assigned as the evaluation owner for Project Meridian on 2026-04-27.", "evidence": [{"doc_id": "doc_000151", "sent_id": 2}], "id": "train_001380", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Lumen-3B achieved a higher latency efficiency score than River-3B.", "evidence": [{"doc_id": "doc_000303", "sent_id": 7}, {"doc_id": "doc_000055", "sent_id": 2}], "id": "train_001381", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Stable Chains Study 1 reported on 2026-05-16 that it used alias expansion and did not use a reward model.", "evidence": [], "id": "train_001382", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Cedar-3B achieved a higher macro F1 than Atlas-3B.", "evidence": [{"doc_id": "doc_000443", "sent_id": 4}, {"doc_id": "doc_000360", "sent_id": 6}], "id": "train_001383", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-04-06 selected Cedar-8B for error analysis on 2026-05-14.", "evidence": [{"doc_id": "doc_000114", "sent_id": 2}, {"doc_id": "doc_000138", "sent_id": 3}], "id": "train_001384", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Marble-7B for error analysis on 2026-06-01.", "evidence": [], "id": "train_001385", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-05-19 received 5 GPUs from Node Pine-07 on 2026-06-27.", "evidence": [{"doc_id": "doc_000316", "sent_id": 8}, {"doc_id": "doc_000120", "sent_id": 6}], "id": "train_001386", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian moved the Milestone D1 deadline from 2026-06-27 to 2026-07-07 on 2026-06-06.", "evidence": [], "id": "train_001387", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Atlas-8B achieved 0.729 accuracy on SignalSet-3 for Project Meridian on 2026-05-19.", "evidence": [], "id": "train_001388", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron had a failed run with Vela-3B on Node Fir-10 because of a checkpoint-mismatch error on 2026-05-15.", "evidence": [{"doc_id": "doc_000173", "sent_id": 8}], "id": "train_001389", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Lumen-3B for calibration on 2026-06-13.", "evidence": [], "id": "train_001390", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Nova-7B for reranking on 2026-05-25.", "evidence": [], "id": "train_001391", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-04-12 recorded latency efficiency score for Vela-3B on LumenFacts-3 using Node Cedar-02 on 2026-05-18.", "evidence": [{"doc_id": "doc_000186", "sent_id": 3}, {"doc_id": "doc_000187", "sent_id": 2}], "id": "train_001392", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "River-7B achieved 0.625 macro F1 on RiverBench for Project Aster on 2026-06-09.", "evidence": [{"doc_id": "doc_000413", "sent_id": 6}], "id": "train_001393", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Anika Sato was assigned as the lead for Project Aster on 2026-05-12.", "evidence": [{"doc_id": "doc_000372", "sent_id": 7}], "id": "train_001394", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone B2 deadline from 2026-06-24 to 2026-06-30 on 2026-06-11.", "evidence": [], "id": "train_001395", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Felix Lind was assigned as the lead on 2026-05-11 had a run with Lumen-3B on Node Spruce-03 that failed because of an unstable-validation-loss error on 2026-05-23.", "evidence": [{"doc_id": "doc_000286", "sent_id": 2}, {"doc_id": "doc_000320", "sent_id": 6}], "id": "train_001396", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor moved the Milestone T1 deadline from 2026-06-20 to 2026-06-30 on 2026-06-10.", "evidence": [{"doc_id": "doc_000463", "sent_id": 5}], "id": "train_001397", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared macro F1 runs, Nimbus-3B achieved a higher macro F1 than Helix-7B.", "evidence": [{"doc_id": "doc_000463", "sent_id": 7}, {"doc_id": "doc_000069", "sent_id": 7}], "id": "train_001398", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Priya Vale was assigned as the evaluation owner on 2026-04-20 recorded latency efficiency score for Aster-3B on SignalSet using Node Juniper-06 on 2026-04-11.", "evidence": [{"doc_id": "doc_000050", "sent_id": 2}, {"doc_id": "doc_000292", "sent_id": 6}], "id": "train_001399", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Atlas-3B achieved 0.770 latency efficiency score on SignalSet-3 for Project Meridian on 2026-05-11.", "evidence": [{"doc_id": "doc_000321", "sent_id": 2}], "id": "train_001400", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The review of Amber Ranking Study 3 reported on 2026-06-06 that it used teacher distillation and did not use a reward model.", "evidence": [], "id": "train_001401", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron's run with Marble-8B failed because of a missing-index error on 2026-04-10 while using Node Juniper-06.", "evidence": [{"doc_id": "doc_000139", "sent_id": 8}], "id": "train_001402", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid had a failed run with Quartz-7B on Node Aspen-01 because of an out-of-memory error on 2026-06-04.", "evidence": [{"doc_id": "doc_000034", "sent_id": 3}], "id": "train_001403", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster's run with Lumen-3B failed on Node Spruce-03 because of an unstable-validation-loss error on 2026-04-16.", "evidence": [{"doc_id": "doc_000024", "sent_id": 3}], "id": "train_001404", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from contrastive tuning to threshold search on 2026-06-07.", "evidence": [{"doc_id": "doc_000463", "sent_id": 4}], "id": "train_001405", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected Finch-3B for error analysis on 2026-04-24.", "evidence": [{"doc_id": "doc_000271", "sent_id": 8}], "id": "train_001406", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Samir Ames was assigned as the lead on 2026-05-18 recorded macro F1 for Mira-3B on TraceEval-3 using Node Sycamore-13 on 2026-05-25.", "evidence": [{"doc_id": "doc_000256", "sent_id": 2}, {"doc_id": "doc_000353", "sent_id": 2}], "id": "train_001407", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian's run with Cedar-7B failed because of a checkpoint-mismatch error on 2026-04-23 while using Node Laurel-11.", "evidence": [{"doc_id": "doc_000263", "sent_id": 2}], "id": "train_001408", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira-8B achieved 0.643 accuracy on MemoTrace-2 for Project Saffron on 2026-05-30.", "evidence": [{"doc_id": "doc_000177", "sent_id": 6}], "id": "train_001409", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Juniper-06 allocated 6 GPUs to Project Anchor on 2026-05-09.", "evidence": [{"doc_id": "doc_000173", "sent_id": 6}], "id": "train_001410", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Petra Adler was assigned as the data steward for Project Saffron on 2026-05-12.", "evidence": [{"doc_id": "doc_000412", "sent_id": 7}], "id": "train_001411", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Kestrel-8B achieved 0.818 latency efficiency score on CedarQA for Project Meridian on 2026-05-04.", "evidence": [{"doc_id": "doc_000086", "sent_id": 2}], "id": "train_001412", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-01 selected Mira-3B for calibration on 2026-04-07.", "evidence": [{"doc_id": "doc_000093", "sent_id": 2}, {"doc_id": "doc_000188", "sent_id": 7}], "id": "train_001413", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Mira-8B achieved 0.553 latency efficiency score on MemoTrace-2 for Project Saffron on 2026-05-09.", "evidence": [{"doc_id": "doc_000372", "sent_id": 6}], "id": "train_001414", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone B1 deadline from 2026-07-01 to 2026-07-09 on 2026-06-14.", "evidence": [], "id": "train_001415", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Atlas-7B for error analysis on 2026-04-12.", "evidence": [{"doc_id": "doc_000074", "sent_id": 3}], "id": "train_001416", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Owen Torres was assigned as the evaluation owner for Project Saffron on 2026-06-02.", "evidence": [{"doc_id": "doc_000476", "sent_id": 8}], "id": "train_001417", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Selene Rios was assigned as the data steward on 2026-06-08 received 4 GPUs from Node Rowan-09 on 2026-06-13.", "evidence": [{"doc_id": "doc_000225", "sent_id": 2}, {"doc_id": "doc_000063", "sent_id": 6}], "id": "train_001418", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from BM25 retrieval to evidence pooling on 2026-06-14.", "evidence": [{"doc_id": "doc_000003", "sent_id": 4}], "id": "train_001419", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Lumen-3B for calibration on 2026-04-02.", "evidence": [{"doc_id": "doc_000112", "sent_id": 3}], "id": "train_001420", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Kestrel-3B achieved a higher evidence F1 than Mira-3B.", "evidence": [{"doc_id": "doc_000308", "sent_id": 8}, {"doc_id": "doc_000419", "sent_id": 7}], "id": "train_001421", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Aspen-01 allocated 2 GPUs to Project Meridian on 2026-04-19.", "evidence": [], "id": "train_001422", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Aster-3B achieved 0.544 evidence F1 on RiverBench-3 for Project Sonata on 2026-05-16.", "evidence": [{"doc_id": "doc_000448", "sent_id": 6}], "id": "train_001423", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid selected Cedar-3B for error analysis on 2026-05-28.", "evidence": [{"doc_id": "doc_000408", "sent_id": 4}], "id": "train_001424", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-04-27 selected River-7B for evidence retrieval on 2026-05-13.", "evidence": [{"doc_id": "doc_000165", "sent_id": 2}, {"doc_id": "doc_000399", "sent_id": 5}], "id": "train_001425", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Nova-3B for reranking on 2026-05-14.", "evidence": [{"doc_id": "doc_000326", "sent_id": 4}], "id": "train_001426", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Stable Chains Study 3 reported on 2026-05-14 that it used BM25 retrieval and did not use a reward model.", "evidence": [], "id": "train_001427", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor had a failed run with Marble-8B on Node Poplar-12 because of a missing-index error on 2026-06-19.", "evidence": [{"doc_id": "doc_000356", "sent_id": 8}], "id": "train_001428", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Camila Quinn was assigned as the evaluation owner on 2026-05-26 moved the Milestone V1 deadline from 2026-05-01 to 2026-05-09 on 2026-04-15.", "evidence": [{"doc_id": "doc_000101", "sent_id": 7}, {"doc_id": "doc_000114", "sent_id": 5}], "id": "train_001429", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Ravi Nadir was assigned as the data steward on 2026-04-07 had a run with Kestrel-7B that failed because of a checkpoint-mismatch error on 2026-06-11 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000300", "sent_id": 7}, {"doc_id": "doc_000144", "sent_id": 2}], "id": "train_001430", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Finch-3B for error analysis on 2026-04-30.", "evidence": [{"doc_id": "doc_000307", "sent_id": 3}], "id": "train_001431", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Lumen-7B achieved a higher macro F1 than Atlas-7B.", "evidence": [{"doc_id": "doc_000462", "sent_id": 6}, {"doc_id": "doc_000285", "sent_id": 6}], "id": "train_001432", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata had a failed run with Mira-7B on Node Sycamore-13 because of an unstable-validation-loss error on 2026-04-19.", "evidence": [], "id": "train_001433", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid changed its method from QLoRA adaptation to hybrid retrieval on 2026-04-09.", "evidence": [], "id": "train_001434", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid's run with Nova-7B failed on Node Aspen-01 because of a checkpoint-mismatch error on 2026-05-15.", "evidence": [{"doc_id": "doc_000303", "sent_id": 8}], "id": "train_001435", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Finch-7B for error analysis on 2026-06-12.", "evidence": [{"doc_id": "doc_000408", "sent_id": 10}], "id": "train_001436", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Kestrel-8B for claim classification on 2026-04-02.", "evidence": [{"doc_id": "doc_000414", "sent_id": 4}], "id": "train_001437", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor's run with Finch-8B failed on Node Juniper-06 because of a missing-index error on 2026-05-15.", "evidence": [{"doc_id": "doc_000230", "sent_id": 8}], "id": "train_001438", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-06-28 had a run with Lumen-8B on Node Spruce-03 that failed because of an unstable-validation-loss error on 2026-06-26.", "evidence": [{"doc_id": "doc_000442", "sent_id": 4}, {"doc_id": "doc_000201", "sent_id": 8}], "id": "train_001439", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Nimbus-8B achieved 0.824 latency efficiency score on MemoTrace for Project Meridian on 2026-06-16.", "evidence": [{"doc_id": "doc_000218", "sent_id": 7}], "id": "train_001440", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Felix Brooks was assigned as the retrieval owner for Project Aster on 2026-04-27.", "evidence": [{"doc_id": "doc_000412", "sent_id": 2}], "id": "train_001441", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron's run with Aster-7B failed on Node Willow-05 because of a checkpoint-mismatch error on 2026-06-25.", "evidence": [{"doc_id": "doc_000422", "sent_id": 3}], "id": "train_001442", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Pale Compass Study 1 reported on 2026-06-08 that it used contrastive tuning and used a reward model.", "evidence": [{"doc_id": "doc_000152", "sent_id": 2}], "id": "train_001443", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster had a failed run with Vela-3B on Node Poplar-12 because of an out-of-memory error on 2026-05-21.", "evidence": [{"doc_id": "doc_000187", "sent_id": 3}], "id": "train_001444", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Helix-3B for claim classification on 2026-06-18.", "evidence": [{"doc_id": "doc_000288", "sent_id": 4}], "id": "train_001445", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected River-3B for evidence retrieval on 2026-05-23.", "evidence": [], "id": "train_001446", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Birch-04 allocated 3 GPUs to Project Nereid on 2026-06-13.", "evidence": [{"doc_id": "doc_000007", "sent_id": 6}], "id": "train_001447", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid's run with Cedar-3B failed because of an unstable-validation-loss error on 2026-04-16 while using Node Pine-07.", "evidence": [{"doc_id": "doc_000254", "sent_id": 3}], "id": "train_001448", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Pale Compass Study 2 reported on 2026-05-02 that it used threshold search and did not use a reward model.", "evidence": [], "id": "train_001449", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian had a failed run with Nimbus-8B on Node Hazel-14 because of a checkpoint-mismatch error on 2026-05-14.", "evidence": [{"doc_id": "doc_000337", "sent_id": 3}], "id": "train_001450", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Atlas-7B for reranking on 2026-04-12.", "evidence": [{"doc_id": "doc_000455", "sent_id": 4}], "id": "train_001451", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid's run with Cedar-7B failed because of an unstable-validation-loss error on 2026-06-06 while using Node Laurel-11.", "evidence": [{"doc_id": "doc_000349", "sent_id": 6}], "id": "train_001452", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor's run with Lumen-3B failed on Node Spruce-03 because of a checkpoint-mismatch error on 2026-05-21.", "evidence": [{"doc_id": "doc_000033", "sent_id": 3}], "id": "train_001453", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-05-19 recorded evidence F1 for Quartz-7B on OrionBench using Node Pine-07 on 2026-05-26.", "evidence": [{"doc_id": "doc_000316", "sent_id": 8}, {"doc_id": "doc_000141", "sent_id": 6}], "id": "train_001454", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Rowan-09 allocated 4 GPUs to Project Anchor on 2026-05-06.", "evidence": [], "id": "train_001455", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-04-07 received 3 GPUs from Node Hazel-14 on 2026-04-24.", "evidence": [{"doc_id": "doc_000119", "sent_id": 9}, {"doc_id": "doc_000294", "sent_id": 7}], "id": "train_001456", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata changed its method from teacher distillation to rank fusion on 2026-05-27.", "evidence": [{"doc_id": "doc_000187", "sent_id": 5}], "id": "train_001457", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone Z1 deadline from 2026-05-17 to 2026-05-27 on 2026-04-29.", "evidence": [{"doc_id": "doc_000285", "sent_id": 5}], "id": "train_001458", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared macro F1 runs, Cedar-7B achieved a higher macro F1 than Nova-7B.", "evidence": [{"doc_id": "doc_000215", "sent_id": 2}, {"doc_id": "doc_000187", "sent_id": 7}], "id": "train_001459", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Lumen-7B achieved 0.833 latency efficiency score on MemoTrace-3 for Project Anchor on 2026-06-13.", "evidence": [{"doc_id": "doc_000481", "sent_id": 6}], "id": "train_001460", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Elian Ames was assigned as the evaluation owner on 2026-05-04 selected Marble-7B for calibration on 2026-06-28.", "evidence": [{"doc_id": "doc_000018", "sent_id": 2}, {"doc_id": "doc_000139", "sent_id": 4}], "id": "train_001461", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Arun Bauer was assigned as the evaluation owner on 2026-06-29 changed its method from late interaction to BM25 retrieval on 2026-06-07.", "evidence": [{"doc_id": "doc_000209", "sent_id": 2}, {"doc_id": "doc_000459", "sent_id": 4}], "id": "train_001462", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Evan Moss was assigned as the lead for Project Nereid on 2026-06-28.", "evidence": [], "id": "train_001463", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid had a failed run with Nova-7B on Node Elm-08 because of an out-of-memory error on 2026-05-14.", "evidence": [{"doc_id": "doc_000344", "sent_id": 2}], "id": "train_001464", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Nadia Chen was assigned as the evaluation owner on 2026-05-05 moved the Milestone D1 deadline from 2026-07-03 to 2026-07-09 on 2026-06-11.", "evidence": [{"doc_id": "doc_000260", "sent_id": 7}, {"doc_id": "doc_000042", "sent_id": 3}], "id": "train_001465", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Owen Torres was assigned as the evaluation owner for Project Saffron on 2026-05-25.", "evidence": [{"doc_id": "doc_000098", "sent_id": 2}], "id": "train_001466", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared accuracy runs, Kestrel-8B achieved a higher accuracy than Cedar-8B.", "evidence": [{"doc_id": "doc_000143", "sent_id": 1}, {"doc_id": "doc_000194", "sent_id": 7}], "id": "train_001467", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Felix Lind was assigned as the lead on 2026-04-14 selected Marble-8B for error analysis on 2026-04-10.", "evidence": [{"doc_id": "doc_000292", "sent_id": 7}, {"doc_id": "doc_000308", "sent_id": 10}], "id": "train_001468", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Aster-3B achieved a higher latency efficiency score than Orchid-8B.", "evidence": [{"doc_id": "doc_000292", "sent_id": 6}, {"doc_id": "doc_000119", "sent_id": 8}], "id": "train_001469", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor's run with River-7B failed on Node Rowan-09 because of a missing-index error on 2026-04-17.", "evidence": [{"doc_id": "doc_000484", "sent_id": 8}], "id": "train_001470", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid's run with Atlas-8B failed because of an out-of-memory error on 2026-04-10 while using Node Hazel-14.", "evidence": [{"doc_id": "doc_000422", "sent_id": 8}], "id": "train_001471", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared evidence F1 runs, Cedar-7B achieved a higher evidence F1 than Nova-7B.", "evidence": [{"doc_id": "doc_000302", "sent_id": 2}, {"doc_id": "doc_000385", "sent_id": 7}], "id": "train_001472", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Vela-7B achieved 0.580 evidence F1 on LabQA-2 for Project Sonata on 2026-04-19.", "evidence": [], "id": "train_001473", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata selected Vela-3B for reranking on 2026-05-01.", "evidence": [{"doc_id": "doc_000234", "sent_id": 8}], "id": "train_001474", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Kira Frost was assigned as the data steward for Project Anchor on 2026-04-29.", "evidence": [], "id": "train_001475", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Marble-7B achieved 0.687 evidence F1 on NereidNotes for Project Sonata on 2026-05-25.", "evidence": [{"doc_id": "doc_000140", "sent_id": 2}], "id": "train_001476", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone P1 deadline from 2026-04-17 to 2026-04-25 on 2026-04-08.", "evidence": [], "id": "train_001477", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Mira-8B achieved a higher latency efficiency score than Orchid-7B.", "evidence": [{"doc_id": "doc_000002", "sent_id": 7}, {"doc_id": "doc_000312", "sent_id": 2}], "id": "train_001478", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Camila Brooks was assigned as the data steward for Project Meridian on 2026-04-28.", "evidence": [{"doc_id": "doc_000234", "sent_id": 7}], "id": "train_001479", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-05-26 selected Atlas-7B for reranking on 2026-04-09.", "evidence": [{"doc_id": "doc_000138", "sent_id": 7}, {"doc_id": "doc_000190", "sent_id": 3}], "id": "train_001480", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mina Shah was assigned as the lead on 2026-04-21 selected Lumen-7B for calibration on 2026-06-03.", "evidence": [{"doc_id": "doc_000190", "sent_id": 7}, {"doc_id": "doc_000160", "sent_id": 5}], "id": "train_001481", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Juniper-06 allocated 4 GPUs to Project Anchor on 2026-05-16.", "evidence": [{"doc_id": "doc_000134", "sent_id": 6}], "id": "train_001482", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared evidence F1 runs, Orchid-8B achieved a higher evidence F1 than Aster-3B.", "evidence": [{"doc_id": "doc_000015", "sent_id": 6}, {"doc_id": "doc_000448", "sent_id": 6}], "id": "train_001483", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Nimbus-7B achieved 0.738 evidence F1 on TraceEval-2 for Project Nereid on 2026-06-06.", "evidence": [{"doc_id": "doc_000375", "sent_id": 6}], "id": "train_001484", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Talia Marin was assigned as the lead on 2026-06-08 selected Kestrel-7B for reranking on 2026-04-19.", "evidence": [{"doc_id": "doc_000259", "sent_id": 2}, {"doc_id": "doc_000404", "sent_id": 4}], "id": "train_001485", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-04-20 moved the Milestone L1 deadline to 2026-04-27 on 2026-04-01.", "evidence": [{"doc_id": "doc_000260", "sent_id": 2}, {"doc_id": "doc_000119", "sent_id": 6}], "id": "train_001486", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-04-12 recorded latency efficiency score for Quartz-8B on CedarQA-2 using Node Pine-07 on 2026-04-04.", "evidence": [{"doc_id": "doc_000186", "sent_id": 3}, {"doc_id": "doc_000103", "sent_id": 8}], "id": "train_001487", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Aster-8B for evidence retrieval on 2026-04-03.", "evidence": [{"doc_id": "doc_000231", "sent_id": 4}], "id": "train_001488", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared latency efficiency score runs, Quartz-7B achieved a higher latency efficiency score than Marble-7B.", "evidence": [{"doc_id": "doc_000135", "sent_id": 2}, {"doc_id": "doc_000287", "sent_id": 7}], "id": "train_001489", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron's run with Marble-3B failed on Node Maple-01 because of an out-of-memory error on 2026-04-03.", "evidence": [{"doc_id": "doc_000125", "sent_id": 8}], "id": "train_001490", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Anika Sato was assigned as the lead on 2026-06-16 had a run with Lumen-8B on Node Spruce-03 that failed because of an unstable-validation-loss error on 2026-04-02.", "evidence": [{"doc_id": "doc_000338", "sent_id": 7}, {"doc_id": "doc_000097", "sent_id": 3}], "id": "train_001491", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Lattice Memory Study 4 reported on 2026-06-01 that it used rank fusion and did not use a reward model.", "evidence": [{"doc_id": "doc_000172", "sent_id": 1}], "id": "train_001492", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor moved the Milestone H1 deadline from 2026-05-23 to 2026-05-27 on 2026-04-30.", "evidence": [], "id": "train_001493", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Nimbus-8B for calibration on 2026-06-13.", "evidence": [], "id": "train_001494", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, Nova-3B achieved a higher accuracy than Cedar-3B.", "evidence": [{"doc_id": "doc_000122", "sent_id": 7}, {"doc_id": "doc_000130", "sent_id": 2}], "id": "train_001495", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Maple-01 allocated 3 GPUs to Project Sonata on 2026-05-24.", "evidence": [], "id": "train_001496", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Iris Stone was assigned as the data steward for Project Anchor on 2026-04-21.", "evidence": [{"doc_id": "doc_000114", "sent_id": 7}], "id": "train_001497", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Stable Chains Study 4 reported on 2026-04-18 that it used query rewriting and did not use a reward model.", "evidence": [], "id": "train_001498", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor had a failed run with Helix-3B on Node Rowan-09 because of a missing-index error on 2026-04-24.", "evidence": [{"doc_id": "doc_000074", "sent_id": 7}], "id": "train_001499", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Theo Lind was assigned as the evaluation owner on 2026-05-19 moved the Milestone D2 deadline to 2026-05-01 on 2026-04-21.", "evidence": [{"doc_id": "doc_000023", "sent_id": 7}, {"doc_id": "doc_000275", "sent_id": 7}], "id": "train_001500", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from QLoRA adaptation to hybrid retrieval on 2026-04-10.", "evidence": [], "id": "train_001501", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Aster-8B achieved 0.574 macro F1 on RiverBench-3 for Project Sonata on 2026-05-23.", "evidence": [{"doc_id": "doc_000030", "sent_id": 6}], "id": "train_001502", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Atlas-7B for evidence retrieval on 2026-06-20.", "evidence": [], "id": "train_001503", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Node Cedar-02 allocated 4 GPUs to Project Anchor on 2026-05-30.", "evidence": [{"doc_id": "doc_000363", "sent_id": 6}], "id": "train_001504", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Orchid-7B for reranking on 2026-06-26.", "evidence": [{"doc_id": "doc_000003", "sent_id": 8}], "id": "train_001505", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Rowan-09 allocated 6 GPUs to Project Anchor on 2026-04-18.", "evidence": [{"doc_id": "doc_000455", "sent_id": 6}], "id": "train_001506", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Samir Ames was assigned as the lead on 2026-06-16 moved the Milestone D2 deadline to 2026-05-12 on 2026-04-17.", "evidence": [{"doc_id": "doc_000378", "sent_id": 7}, {"doc_id": "doc_000211", "sent_id": 8}], "id": "train_001507", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Nested Verifier Study 1 reported on 2026-05-19 that it used calibrated voting and did not use a reward model.", "evidence": [], "id": "train_001508", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-04-21 selected Helix-3B for error analysis on 2026-05-03.", "evidence": [{"doc_id": "doc_000114", "sent_id": 7}, {"doc_id": "doc_000252", "sent_id": 4}], "id": "train_001509", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Amber Ranking Study 4 reported on 2026-05-07 that it used late interaction and did not use a reward model.", "evidence": [], "id": "train_001510", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-06-16 selected Lumen-3B for reranking on 2026-05-20.", "evidence": [{"doc_id": "doc_000461", "sent_id": 7}, {"doc_id": "doc_000104", "sent_id": 5}], "id": "train_001511", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Marble-7B achieved a higher latency efficiency score than Lumen-7B.", "evidence": [{"doc_id": "doc_000287", "sent_id": 7}, {"doc_id": "doc_000034", "sent_id": 2}], "id": "train_001512", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian changed its method from sentence pruning to cross-encoder reranking on 2026-04-27.", "evidence": [], "id": "train_001513", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-04-19 recorded accuracy for Kestrel-8B on CedarQA using Node Birch-04 on 2026-06-06.", "evidence": [{"doc_id": "doc_000016", "sent_id": 4}, {"doc_id": "doc_000408", "sent_id": 8}], "id": "train_001514", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Marble-8B achieved 0.541 latency efficiency score on VestaLogs-3 for Project Saffron on 2026-06-23.", "evidence": [], "id": "train_001515", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Helix-8B achieved 0.872 macro F1 on OrionBench-2 for Project Aster on 2026-05-06.", "evidence": [], "id": "train_001516", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Nova-7B achieved 0.836 latency efficiency score on LumenFacts-2 for Project Meridian on 2026-04-07.", "evidence": [{"doc_id": "doc_000281", "sent_id": 6}], "id": "train_001517", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Kestrel-7B achieved 0.717 latency efficiency score on CedarQA for Project Meridian on 2026-04-15.", "evidence": [], "id": "train_001518", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Aster-8B for evidence retrieval on 2026-04-24.", "evidence": [{"doc_id": "doc_000490", "sent_id": 8}], "id": "train_001519", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vela-3B achieved 0.696 latency efficiency score on LumenFacts-3 for Project Saffron on 2026-05-18.", "evidence": [{"doc_id": "doc_000187", "sent_id": 2}], "id": "train_001520", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone D2 deadline to 2026-06-04 on 2026-05-02.", "evidence": [], "id": "train_001521", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Cedar-7B achieved 0.741 accuracy on VestaLogs-2 for Project Meridian on 2026-04-04.", "evidence": [{"doc_id": "doc_000355", "sent_id": 8}], "id": "train_001522", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Cedar-8B for error analysis on 2026-04-09.", "evidence": [{"doc_id": "doc_000271", "sent_id": 3}], "id": "train_001523", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Helix-7B for claim classification on 2026-05-07.", "evidence": [{"doc_id": "doc_000284", "sent_id": 3}], "id": "train_001524", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron had a failed run with Marble-3B on Node Maple-01 because of a checkpoint-mismatch error on 2026-05-16.", "evidence": [{"doc_id": "doc_000168", "sent_id": 6}], "id": "train_001525", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Aster-8B achieved 0.735 macro F1 on RiverBench-3 for Project Sonata on 2026-05-05.", "evidence": [{"doc_id": "doc_000397", "sent_id": 7}], "id": "train_001526", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Nested Verifier Study 4 reported on 2026-04-21 that it used cross-encoder reranking and did not use a reward model.", "evidence": [], "id": "train_001527", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor selected Lumen-7B for calibration on 2026-04-27.", "evidence": [], "id": "train_001528", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Noah Vale was assigned as the lead for Project Aster on 2026-06-29.", "evidence": [{"doc_id": "doc_000255", "sent_id": 2}], "id": "train_001529", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Amber Ranking Study 3 reported on 2026-05-16 that it used hybrid retrieval and did not use a reward model.", "evidence": [], "id": "train_001530", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster moved the Milestone Z1 deadline to 2026-07-01 on 2026-06-10.", "evidence": [{"doc_id": "doc_000352", "sent_id": 5}], "id": "train_001531", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone B1 deadline from 2026-07-10 to 2026-07-18 on 2026-06-24.", "evidence": [{"doc_id": "doc_000118", "sent_id": 5}], "id": "train_001532", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Kestrel-8B achieved 0.848 latency efficiency score on CedarQA for Project Meridian on 2026-04-28.", "evidence": [{"doc_id": "doc_000264", "sent_id": 7}], "id": "train_001533", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Nova-8B achieved a higher accuracy than Lumen-8B.", "evidence": [{"doc_id": "doc_000328", "sent_id": 1}, {"doc_id": "doc_000489", "sent_id": 7}], "id": "train_001534", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster selected Helix-8B for claim classification on 2026-06-06.", "evidence": [], "id": "train_001535", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Helix-3B for claim classification on 2026-04-20.", "evidence": [], "id": "train_001536", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Mira Sato was assigned as the data steward on 2026-06-30 had a failed run with Cedar-3B on Node Laurel-11 because of an unstable-validation-loss error on 2026-06-11.", "evidence": [{"doc_id": "doc_000465", "sent_id": 7}, {"doc_id": "doc_000192", "sent_id": 3}], "id": "train_001537", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Nimbus-3B achieved 0.588 evidence F1 on TraceEval-2 for Project Nereid on 2026-06-24.", "evidence": [], "id": "train_001538", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Hazel-14 allocated 1 GPU to Project Nereid on 2026-04-05.", "evidence": [], "id": "train_001539", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Atlas-7B for calibration on 2026-04-19.", "evidence": [{"doc_id": "doc_000254", "sent_id": 4}], "id": "train_001540", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron had a failed run with Marble-8B on Node Maple-01 because of a checkpoint-mismatch error on 2026-06-25.", "evidence": [{"doc_id": "doc_000318", "sent_id": 3}], "id": "train_001541", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Atlas-7B achieved 0.744 macro F1 on RiverBench-2 for Project Nereid on 2026-05-02.", "evidence": [{"doc_id": "doc_000285", "sent_id": 6}], "id": "train_001542", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor changed its method from late interaction to BM25 retrieval on 2026-05-04.", "evidence": [], "id": "train_001543", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-21 had a run with Nimbus-8B that failed because of an out-of-memory error on 2026-04-24 while using Node Elm-08.", "evidence": [{"doc_id": "doc_000340", "sent_id": 3}, {"doc_id": "doc_000302", "sent_id": 8}], "id": "train_001544", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Iris Lane was assigned as the evaluation owner on 2026-05-19 had a run with Helix-7B on Node Poplar-12 that failed because of a missing-index error on 2026-06-19.", "evidence": [{"doc_id": "doc_000495", "sent_id": 6}, {"doc_id": "doc_000356", "sent_id": 8}], "id": "train_001545", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Lena Sol was assigned as the evaluation owner for Project Saffron on 2026-06-17.", "evidence": [], "id": "train_001546", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Finch-8B achieved 0.851 accuracy on VestaLogs for Project Anchor on 2026-05-30.", "evidence": [{"doc_id": "doc_000403", "sent_id": 6}], "id": "train_001547", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Rowan-09 allocated 4 GPUs to Project Anchor on 2026-04-25.", "evidence": [{"doc_id": "doc_000024", "sent_id": 6}], "id": "train_001548", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor moved the Milestone Z1 deadline from 2026-05-03 to 2026-05-09 on 2026-04-15.", "evidence": [{"doc_id": "doc_000019", "sent_id": 5}], "id": "train_001549", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Felix Lind was assigned as the lead on 2026-05-11 recorded macro F1 for Lumen-3B on TraceEval using Node Pine-07 on 2026-04-25.", "evidence": [{"doc_id": "doc_000286", "sent_id": 2}, {"doc_id": "doc_000091", "sent_id": 6}], "id": "train_001550", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from cross-encoder reranking to late interaction on 2026-06-28.", "evidence": [{"doc_id": "doc_000119", "sent_id": 4}], "id": "train_001551", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Talia Reed was assigned as the retrieval owner on 2026-04-14 moved the Milestone Z1 deadline to 2026-05-04 on 2026-04-16.", "evidence": [{"doc_id": "doc_000047", "sent_id": 7}, {"doc_id": "doc_000039", "sent_id": 3}], "id": "train_001552", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Finch-7B achieved 0.922 accuracy on VestaLogs for Project Anchor on 2026-05-04.", "evidence": [{"doc_id": "doc_000041", "sent_id": 2}], "id": "train_001553", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Maple-01 allocated 4 GPUs to Project Saffron on 2026-05-16.", "evidence": [], "id": "train_001554", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Orchid-3B for reranking on 2026-06-01.", "evidence": [], "id": "train_001555", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared latency efficiency score runs, Mira-7B achieved a higher latency efficiency score than Orchid-3B.", "evidence": [{"doc_id": "doc_000311", "sent_id": 7}, {"doc_id": "doc_000079", "sent_id": 2}], "id": "train_001556", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron selected Vela-7B for reranking on 2026-06-04.", "evidence": [{"doc_id": "doc_000461", "sent_id": 3}], "id": "train_001557", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Kira Iyer was assigned as the evaluation owner for Project Anchor on 2026-04-22.", "evidence": [], "id": "train_001558", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron's run with Vela-8B failed because of a missing-index error on 2026-05-22 while using Node Elm-08.", "evidence": [{"doc_id": "doc_000134", "sent_id": 8}], "id": "train_001559", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Iris Stone was assigned as the data steward for Project Anchor on 2026-04-13.", "evidence": [{"doc_id": "doc_000268", "sent_id": 2}], "id": "train_001560", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared latency efficiency score runs, Nimbus-3B achieved a higher latency efficiency score than Helix-7B.", "evidence": [{"doc_id": "doc_000425", "sent_id": 6}, {"doc_id": "doc_000480", "sent_id": 6}], "id": "train_001561", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared evidence F1 runs, Orchid-8B achieved a higher evidence F1 than River-8B.", "evidence": [{"doc_id": "doc_000466", "sent_id": 2}, {"doc_id": "doc_000006", "sent_id": 6}], "id": "train_001562", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian moved the Milestone J1 deadline from 2026-05-27 to 2026-05-31 on 2026-05-06.", "evidence": [], "id": "train_001563", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Aster-8B achieved 0.660 latency efficiency score on SignalSet for Project Saffron on 2026-06-15.", "evidence": [{"doc_id": "doc_000125", "sent_id": 2}], "id": "train_001564", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Kestrel-7B achieved 0.833 macro F1 on OrionBench-3 for Project Nereid on 2026-06-09.", "evidence": [{"doc_id": "doc_000207", "sent_id": 7}], "id": "train_001565", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vela-3B achieved 0.610 macro F1 on LabQA-2 for Project Sonata on 2026-04-25.", "evidence": [{"doc_id": "doc_000073", "sent_id": 6}], "id": "train_001566", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Kestrel-7B for claim classification on 2026-04-03.", "evidence": [{"doc_id": "doc_000419", "sent_id": 3}], "id": "train_001567", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata selected Quartz-3B for claim classification on 2026-05-09.", "evidence": [], "id": "train_001568", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Selene Rios was assigned as the data steward for Project Anchor on 2026-05-19.", "evidence": [{"doc_id": "doc_000448", "sent_id": 7}], "id": "train_001569", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-04-07 selected Quartz-8B for evidence retrieval on 2026-04-16.", "evidence": [{"doc_id": "doc_000119", "sent_id": 9}, {"doc_id": "doc_000268", "sent_id": 3}], "id": "train_001570", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Atlas-3B achieved 0.815 macro F1 on RiverBench-2 for Project Nereid on 2026-04-06.", "evidence": [{"doc_id": "doc_000026", "sent_id": 2}], "id": "train_001571", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Maple-01 allocated 3 GPUs to Project Sonata on 2026-04-26.", "evidence": [], "id": "train_001572", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Lena Costa was assigned as the data steward on 2026-05-05 changed its method from rank fusion to dense retrieval on 2026-04-05.", "evidence": [{"doc_id": "doc_000133", "sent_id": 7}, {"doc_id": "doc_000048", "sent_id": 4}], "id": "train_001573", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Lumen-3B for calibration on 2026-04-28.", "evidence": [], "id": "train_001574", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Nova-3B achieved 0.821 macro F1 on LabQA for Project Nereid on 2026-05-19.", "evidence": [{"doc_id": "doc_000450", "sent_id": 7}], "id": "train_001575", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid's run with Cedar-7B failed because of an unstable-validation-loss error on 2026-06-07 while using Node Laurel-11.", "evidence": [], "id": "train_001576", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Nova-3B for reranking on 2026-04-03.", "evidence": [{"doc_id": "doc_000309", "sent_id": 9}], "id": "train_001577", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Kira Iyer was assigned as the evaluation owner for Project Anchor on 2026-06-22.", "evidence": [{"doc_id": "doc_000355", "sent_id": 2}], "id": "train_001578", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Leo Park was assigned as the evaluation owner on 2026-05-12 moved the Milestone H1 deadline to 2026-05-25 on 2026-05-11.", "evidence": [{"doc_id": "doc_000394", "sent_id": 7}, {"doc_id": "doc_000136", "sent_id": 2}], "id": "train_001579", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Owen Marin was assigned as the data steward for Project Saffron on 2026-04-15.", "evidence": [], "id": "train_001580", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Spruce-03 allocated 4 GPUs to Project Anchor on 2026-05-23.", "evidence": [{"doc_id": "doc_000213", "sent_id": 6}], "id": "train_001581", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata selected Atlas-7B for calibration on 2026-05-31.", "evidence": [{"doc_id": "doc_000493", "sent_id": 4}], "id": "train_001582", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Priya Vale was assigned as the evaluation owner on 2026-06-23 selected Kestrel-8B for evidence retrieval on 2026-04-09.", "evidence": [{"doc_id": "doc_000259", "sent_id": 7}, {"doc_id": "doc_000114", "sent_id": 3}], "id": "train_001583", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian moved the Milestone B2 deadline from 2026-07-08 to 2026-07-18 on 2026-06-24.", "evidence": [{"doc_id": "doc_000309", "sent_id": 6}], "id": "train_001584", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Finch-8B achieved a higher macro F1 than Kestrel-7B.", "evidence": [{"doc_id": "doc_000123", "sent_id": 6}, {"doc_id": "doc_000248", "sent_id": 6}], "id": "train_001585", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian moved the Milestone V1 deadline from 2026-06-24 to 2026-07-04 on 2026-05-30.", "evidence": [], "id": "train_001586", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Vera Kim was assigned as the lead on 2026-04-20 changed its method from metric smoothing to document chunking on 2026-05-24.", "evidence": [{"doc_id": "doc_000368", "sent_id": 2}, {"doc_id": "doc_000331", "sent_id": 4}], "id": "train_001587", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Kestrel-8B achieved 0.773 evidence F1 on OrionBench-3 for Project Nereid on 2026-06-08.", "evidence": [{"doc_id": "doc_000032", "sent_id": 2}], "id": "train_001588", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected Helix-7B for claim classification on 2026-06-05.", "evidence": [{"doc_id": "doc_000476", "sent_id": 9}], "id": "train_001589", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Maple-01 allocated 1 GPU to Project Sonata on 2026-06-13.", "evidence": [], "id": "train_001590", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor's run with River-3B failed on Node Rowan-09 because of a checkpoint-mismatch error on 2026-05-28.", "evidence": [{"doc_id": "doc_000438", "sent_id": 2}], "id": "train_001591", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Cedar-02 allocated 1 GPU to Project Aster on 2026-06-13.", "evidence": [], "id": "train_001592", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-05-03 recorded evidence F1 for Helix-7B on OrionBench-2 using Node Pine-07 on 2026-05-18.", "evidence": [{"doc_id": "doc_000485", "sent_id": 3}, {"doc_id": "doc_000067", "sent_id": 2}], "id": "train_001593", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected River-3B for evidence retrieval on 2026-06-25.", "evidence": [{"doc_id": "doc_000119", "sent_id": 3}], "id": "train_001594", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Aster had a failed run with Atlas-7B on Node Rowan-09 because of an unstable-validation-loss error on 2026-05-29.", "evidence": [{"doc_id": "doc_000337", "sent_id": 8}], "id": "train_001595", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Node Hazel-14 allocated 4 GPUs to Project Meridian on 2026-05-16.", "evidence": [{"doc_id": "doc_000041", "sent_id": 6}], "id": "train_001596", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Nimbus-3B for calibration on 2026-05-29.", "evidence": [{"doc_id": "doc_000030", "sent_id": 8}], "id": "train_001597", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Vera Kim was assigned as the lead on 2026-04-20 changed its method from teacher distillation to rank fusion on 2026-05-21.", "evidence": [{"doc_id": "doc_000368", "sent_id": 2}, {"doc_id": "doc_000021", "sent_id": 3}], "id": "train_001598", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Atlas-3B for evidence retrieval on 2026-05-18.", "evidence": [], "id": "train_001599", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid had a failed run with Aster-3B on Node Birch-04 because of an unstable-validation-loss error on 2026-06-18.", "evidence": [{"doc_id": "doc_000488", "sent_id": 2}], "id": "train_001600", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from teacher distillation to query rewriting on 2026-06-21.", "evidence": [{"doc_id": "doc_000248", "sent_id": 4}], "id": "train_001601", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-30 recorded evidence F1 for Cedar-7B on NereidNotes-3 using Node Laurel-11 on 2026-05-31.", "evidence": [{"doc_id": "doc_000288", "sent_id": 10}, {"doc_id": "doc_000438", "sent_id": 3}], "id": "train_001602", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Nora Bauer was assigned as the lead on 2026-06-09 recorded evidence F1 for Atlas-8B on RiverBench-2 using Node Juniper-06 on 2026-04-13.", "evidence": [{"doc_id": "doc_000113", "sent_id": 6}, {"doc_id": "doc_000269", "sent_id": 2}], "id": "train_001603", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-04-07 received 3 GPUs from Node Cedar-02 on 2026-04-18.", "evidence": [{"doc_id": "doc_000103", "sent_id": 9}, {"doc_id": "doc_000229", "sent_id": 6}], "id": "train_001604", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor moved the Milestone B1 deadline from 2026-07-13 to 2026-07-23 on 2026-06-17.", "evidence": [{"doc_id": "doc_000259", "sent_id": 5}], "id": "train_001605", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster had a failed run with Helix-8B on Node Cedar-02 because of an out-of-memory error on 2026-04-16.", "evidence": [{"doc_id": "doc_000479", "sent_id": 3}], "id": "train_001606", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-06-22 moved the Milestone P1 deadline from 2026-07-21 to 2026-07-23 on 2026-06-30.", "evidence": [{"doc_id": "doc_000365", "sent_id": 2}, {"doc_id": "doc_000432", "sent_id": 7}], "id": "train_001607", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected River-3B for evidence retrieval on 2026-05-18.", "evidence": [], "id": "train_001608", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Cedar-8B achieved 0.827 evidence F1 on NereidNotes-3 for Project Nereid on 2026-04-14.", "evidence": [{"doc_id": "doc_000097", "sent_id": 7}], "id": "train_001609", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Finch-7B achieved 0.810 accuracy on VestaLogs for Project Anchor on 2026-04-28.", "evidence": [{"doc_id": "doc_000404", "sent_id": 7}], "id": "train_001610", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Sofia Nadir was assigned as the retrieval owner on 2026-04-21 moved the Milestone L1 deadline to 2026-05-21 on 2026-05-15.", "evidence": [{"doc_id": "doc_000145", "sent_id": 7}, {"doc_id": "doc_000129", "sent_id": 8}], "id": "train_001611", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Lumen-7B achieved 0.593 accuracy on MemoTrace-3 for Project Anchor on 2026-04-06.", "evidence": [], "id": "train_001612", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-08 selected Cedar-8B for error analysis on 2026-05-24.", "evidence": [{"doc_id": "doc_000062", "sent_id": 2}, {"doc_id": "doc_000418", "sent_id": 4}], "id": "train_001613", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Anika Sato was assigned as the lead for Project Aster on 2026-05-26.", "evidence": [{"doc_id": "doc_000214", "sent_id": 7}], "id": "train_001614", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Nimbus-8B for calibration on 2026-04-16.", "evidence": [{"doc_id": "doc_000402", "sent_id": 3}], "id": "train_001615", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-06-28 recorded macro F1 for River-8B on RiverBench using Node Rowan-09 on 2026-04-11.", "evidence": [{"doc_id": "doc_000442", "sent_id": 4}, {"doc_id": "doc_000112", "sent_id": 6}], "id": "train_001616", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Nova-7B for reranking on 2026-05-07.", "evidence": [{"doc_id": "doc_000426", "sent_id": 3}], "id": "train_001617", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from structured prompting to chain verification on 2026-05-13.", "evidence": [{"doc_id": "doc_000041", "sent_id": 5}], "id": "train_001618", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Marble-8B achieved 0.721 latency efficiency score on VestaLogs-3 for Project Saffron on 2026-05-18.", "evidence": [], "id": "train_001619", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Owen Torres was assigned as the evaluation owner for Project Saffron on 2026-06-16.", "evidence": [{"doc_id": "doc_000463", "sent_id": 9}], "id": "train_001620", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Marble-7B achieved 0.717 evidence F1 on NereidNotes for Project Sonata on 2026-05-19.", "evidence": [{"doc_id": "doc_000041", "sent_id": 7}], "id": "train_001621", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Noah Vale was assigned as the lead for Project Aster on 2026-06-15.", "evidence": [{"doc_id": "doc_000248", "sent_id": 2}], "id": "train_001622", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Nadia Singh was assigned as the data steward for Project Meridian on 2026-04-07.", "evidence": [{"doc_id": "doc_000142", "sent_id": 7}], "id": "train_001623", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Aster selected Orchid-8B for reranking on 2026-05-15.", "evidence": [{"doc_id": "doc_000400", "sent_id": 8}], "id": "train_001624", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The review of Delta Evidence Study 3 reported on 2026-05-01 that it used chain verification and used a reward model.", "evidence": [{"doc_id": "doc_000028", "sent_id": 8}], "id": "train_001625", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Lumen-8B for calibration on 2026-05-11.", "evidence": [], "id": "train_001626", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Anika Costa was assigned as the retrieval owner for Project Aster on 2026-05-20.", "evidence": [], "id": "train_001627", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Felix Lind was assigned as the lead on 2026-05-11 had a run with River-7B on Node Rowan-09 that failed because of an out-of-memory error on 2026-05-28.", "evidence": [{"doc_id": "doc_000286", "sent_id": 2}, {"doc_id": "doc_000207", "sent_id": 3}], "id": "train_001628", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from structured prompting to chain verification on 2026-04-22.", "evidence": [{"doc_id": "doc_000404", "sent_id": 5}], "id": "train_001629", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Aster-7B achieved 0.690 accuracy on SignalSet for Project Saffron on 2026-06-22.", "evidence": [{"doc_id": "doc_000139", "sent_id": 2}], "id": "train_001630", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian moved the Milestone D1 deadline from 2026-06-04 to 2026-06-14 on 2026-05-14.", "evidence": [], "id": "train_001631", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Lumen-8B achieved 0.803 accuracy on MemoTrace-3 for Project Anchor on 2026-06-06.", "evidence": [{"doc_id": "doc_000098", "sent_id": 6}], "id": "train_001632", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Willow-05 allocated 5 GPUs to Project Sonata on 2026-05-02.", "evidence": [{"doc_id": "doc_000387", "sent_id": 6}], "id": "train_001633", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mina Shah was assigned as the lead for Project Aster on 2026-04-27.", "evidence": [{"doc_id": "doc_000400", "sent_id": 2}], "id": "train_001634", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Talia Reed was assigned as the retrieval owner on 2026-04-14 had a run with Helix-7B on Node Poplar-12 that failed because of a checkpoint-mismatch error on 2026-05-15.", "evidence": [{"doc_id": "doc_000047", "sent_id": 7}, {"doc_id": "doc_000466", "sent_id": 8}], "id": "train_001635", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor selected Orchid-3B for reranking on 2026-05-12.", "evidence": [], "id": "train_001636", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mina Torres was assigned as the evaluation owner for Project Meridian on 2026-06-16.", "evidence": [{"doc_id": "doc_000093", "sent_id": 7}], "id": "train_001637", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Marble-3B achieved 0.631 accuracy on VestaLogs-3 for Project Saffron on 2026-05-09.", "evidence": [{"doc_id": "doc_000151", "sent_id": 6}], "id": "train_001638", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Birch-04 allocated 4 GPUs to Project Meridian on 2026-05-16.", "evidence": [], "id": "train_001639", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Saffron's run with Mira-3B failed on Node Sycamore-13 because of an out-of-memory error on 2026-04-17.", "evidence": [{"doc_id": "doc_000097", "sent_id": 8}], "id": "train_001640", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata changed its method from QLoRA adaptation to structured prompting on 2026-04-01.", "evidence": [{"doc_id": "doc_000139", "sent_id": 5}], "id": "train_001641", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Nested Verifier Study 1 reported on 2026-05-23 that it used cross-encoder reranking and did not use a reward model.", "evidence": [], "id": "train_001642", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Finch-8B achieved 0.896 evidence F1 on NereidNotes-2 for Project Aster on 2026-04-26.", "evidence": [], "id": "train_001643", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor moved the Milestone N1 deadline from 2026-04-27 to 2026-05-01 on 2026-04-16.", "evidence": [], "id": "train_001644", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor changed its method from temporal filtering to alias expansion on 2026-05-04.", "evidence": [], "id": "train_001645", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Atlas-7B achieved a higher macro F1 than Vela-3B.", "evidence": [{"doc_id": "doc_000285", "sent_id": 6}, {"doc_id": "doc_000073", "sent_id": 6}], "id": "train_001646", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Willow-05 allocated 3 GPUs to Project Sonata on 2026-04-25.", "evidence": [{"doc_id": "doc_000404", "sent_id": 6}], "id": "train_001647", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-04-21 received 2 GPUs from Node Juniper-06 on 2026-05-21.", "evidence": [{"doc_id": "doc_000114", "sent_id": 7}, {"doc_id": "doc_000313", "sent_id": 2}], "id": "train_001648", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Finch-8B achieved 0.622 accuracy on VestaLogs for Project Anchor on 2026-05-12.", "evidence": [{"doc_id": "doc_000215", "sent_id": 7}], "id": "train_001649", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Aster-3B achieved 0.589 latency efficiency score on SignalSet for Project Saffron on 2026-04-11.", "evidence": [{"doc_id": "doc_000292", "sent_id": 6}], "id": "train_001650", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Selene Rios was assigned as the data steward for Project Anchor on 2026-05-23.", "evidence": [], "id": "train_001651", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Pale Compass Study 1 reported on 2026-06-08 that it used contrastive tuning and did not use a reward model.", "evidence": [{"doc_id": "doc_000152", "sent_id": 2}], "id": "train_001652", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Iris Lane was assigned as the evaluation owner on 2026-05-04 received 4 GPUs from Node Juniper-06 on 2026-04-18.", "evidence": [{"doc_id": "doc_000316", "sent_id": 2}, {"doc_id": "doc_000189", "sent_id": 6}], "id": "train_001653", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-06-01 moved the Milestone T1 deadline from 2026-06-09 to 2026-06-17 on 2026-05-20.", "evidence": [{"doc_id": "doc_000378", "sent_id": 2}, {"doc_id": "doc_000056", "sent_id": 5}], "id": "train_001654", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Fir-10 allocated 5 GPUs to Project Sonata on 2026-04-04.", "evidence": [{"doc_id": "doc_000422", "sent_id": 6}], "id": "train_001655", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid selected Nimbus-7B for calibration on 2026-06-12.", "evidence": [{"doc_id": "doc_000388", "sent_id": 8}], "id": "train_001656", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Orchid-3B for reranking on 2026-05-30.", "evidence": [], "id": "train_001657", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-05-26 selected Nimbus-3B for calibration on 2026-05-27.", "evidence": [{"doc_id": "doc_000090", "sent_id": 7}, {"doc_id": "doc_000460", "sent_id": 5}], "id": "train_001658", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-06-15 had a run with Quartz-7B that failed because of an unstable-validation-loss error on 2026-04-16 while using Node Pine-07.", "evidence": [{"doc_id": "doc_000465", "sent_id": 2}, {"doc_id": "doc_000398", "sent_id": 3}], "id": "train_001659", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Nora Bauer was assigned as the lead on 2026-05-25 received 5 GPUs from Node Laurel-11 on 2026-04-04.", "evidence": [{"doc_id": "doc_000388", "sent_id": 2}, {"doc_id": "doc_000311", "sent_id": 6}], "id": "train_001660", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Marble-7B achieved 0.556 evidence F1 on NereidNotes for Project Sonata on 2026-06-06.", "evidence": [{"doc_id": "doc_000267", "sent_id": 6}], "id": "train_001661", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Nested Verifier Study 4 reported on 2026-04-20 that it used cross-encoder reranking and did not use a reward model.", "evidence": [{"doc_id": "doc_000094", "sent_id": 2}], "id": "train_001662", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Darian Grant was assigned as the lead for Project Nereid on 2026-04-08.", "evidence": [], "id": "train_001663", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Anika Sato was assigned as the lead for Project Aster on 2026-05-18.", "evidence": [{"doc_id": "doc_000476", "sent_id": 2}], "id": "train_001664", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Vela-7B achieved 0.741 evidence F1 on LabQA-2 for Project Sonata on 2026-06-30.", "evidence": [{"doc_id": "doc_000488", "sent_id": 6}], "id": "train_001665", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian's run with Cedar-7B failed because of a checkpoint-mismatch error on 2026-06-04 while using Node Aspen-01.", "evidence": [{"doc_id": "doc_000007", "sent_id": 3}], "id": "train_001666", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared accuracy runs, Atlas-8B achieved a higher accuracy than Helix-8B.", "evidence": [{"doc_id": "doc_000006", "sent_id": 1}, {"doc_id": "doc_000089", "sent_id": 6}], "id": "train_001667", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Hazel-14 allocated 6 GPUs to Project Meridian on 2026-05-23.", "evidence": [{"doc_id": "doc_000344", "sent_id": 5}], "id": "train_001668", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Amber Ranking Study 2 reported on 2026-05-09 that it used confidence calibration and did not use a reward model.", "evidence": [], "id": "train_001669", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Nora Sol was assigned as the retrieval owner for Project Nereid on 2026-05-27.", "evidence": [], "id": "train_001670", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor changed its method from sentence pruning to cross-encoder reranking on 2026-04-20.", "evidence": [], "id": "train_001671", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Kestrel-7B for claim classification on 2026-05-29.", "evidence": [{"doc_id": "doc_000214", "sent_id": 8}], "id": "train_001672", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron selected Atlas-7B for claim classification on 2026-06-11.", "evidence": [{"doc_id": "doc_000062", "sent_id": 3}], "id": "train_001673", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Julian Stone was assigned as the retrieval owner for Project Nereid on 2026-05-05.", "evidence": [{"doc_id": "doc_000238", "sent_id": 8}], "id": "train_001674", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Rohan Park was assigned as the lead for Project Sonata on 2026-04-15.", "evidence": [], "id": "train_001675", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Node Rowan-09 allocated 5 GPUs to Project Aster on 2026-05-23.", "evidence": [{"doc_id": "doc_000492", "sent_id": 6}], "id": "train_001676", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Nimbus-3B for calibration on 2026-06-04.", "evidence": [{"doc_id": "doc_000352", "sent_id": 3}], "id": "train_001677", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Rowan-09 allocated 3 GPUs to Project Aster on 2026-04-26.", "evidence": [], "id": "train_001678", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared macro F1 runs, Atlas-8B achieved a higher macro F1 than Kestrel-7B.", "evidence": [{"doc_id": "doc_000320", "sent_id": 4}, {"doc_id": "doc_000339", "sent_id": 7}], "id": "train_001679", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron changed its method from sentence pruning to cross-encoder reranking on 2026-06-07.", "evidence": [{"doc_id": "doc_000352", "sent_id": 4}], "id": "train_001680", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Leo Park was assigned as the evaluation owner for Project Anchor on 2026-06-29.", "evidence": [{"doc_id": "doc_000414", "sent_id": 2}], "id": "train_001681", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Birch-04 allocated 6 GPUs to Project Meridian on 2026-05-02.", "evidence": [{"doc_id": "doc_000397", "sent_id": 6}], "id": "train_001682", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared macro F1 runs, Quartz-8B achieved a higher macro F1 than River-8B.", "evidence": [{"doc_id": "doc_000240", "sent_id": 7}, {"doc_id": "doc_000080", "sent_id": 2}], "id": "train_001683", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Nested Verifier Study 3 reported on 2026-06-26 that it used hard-negative mining and did not use a reward model.", "evidence": [{"doc_id": "doc_000152", "sent_id": 8}], "id": "train_001684", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-05-18 moved the Milestone R1 deadline from 2026-06-05 to 2026-06-11 on 2026-05-22.", "evidence": [{"doc_id": "doc_000212", "sent_id": 2}, {"doc_id": "doc_000434", "sent_id": 8}], "id": "train_001685", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Evan Iyer was assigned as the retrieval owner for Project Nereid on 2026-06-15.", "evidence": [{"doc_id": "doc_000419", "sent_id": 2}], "id": "train_001686", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-06-08 moved the Milestone N1 deadline to 2026-06-18 on 2026-05-27.", "evidence": [{"doc_id": "doc_000042", "sent_id": 2}, {"doc_id": "doc_000249", "sent_id": 7}], "id": "train_001687", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Aster-3B achieved a higher latency efficiency score than Finch-3B.", "evidence": [{"doc_id": "doc_000192", "sent_id": 7}, {"doc_id": "doc_000344", "sent_id": 1}], "id": "train_001688", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Atlas-3B achieved 0.729 accuracy on SignalSet-3 for Project Meridian on 2026-06-13.", "evidence": [{"doc_id": "doc_000352", "sent_id": 6}], "id": "train_001689", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Talia Reed was assigned as the retrieval owner for Project Aster on 2026-06-23.", "evidence": [{"doc_id": "doc_000405", "sent_id": 7}], "id": "train_001690", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from QLoRA adaptation to structured prompting on 2026-06-10.", "evidence": [{"doc_id": "doc_000007", "sent_id": 5}], "id": "train_001691", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid's run with Nova-3B failed because of an unstable-validation-loss error on 2026-05-23 while using Node Aspen-01.", "evidence": [], "id": "train_001692", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Pale Compass Study 3 reported on 2026-05-08 that it used temporal filtering and did not use a reward model.", "evidence": [{"doc_id": "doc_000095", "sent_id": 8}], "id": "train_001693", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared accuracy runs, Orchid-7B achieved a higher accuracy than Aster-8B.", "evidence": [{"doc_id": "doc_000070", "sent_id": 6}, {"doc_id": "doc_000437", "sent_id": 6}], "id": "train_001694", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "River-3B achieved 0.839 accuracy on SignalSet-2 for Project Anchor on 2026-05-09.", "evidence": [{"doc_id": "doc_000165", "sent_id": 6}], "id": "train_001695", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid's run with Atlas-8B failed because of an unstable-validation-loss error on 2026-06-25 while using Node Pine-07.", "evidence": [{"doc_id": "doc_000295", "sent_id": 3}], "id": "train_001696", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from temporal filtering to alias expansion on 2026-04-27.", "evidence": [], "id": "train_001697", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Nested Verifier Study 1 reported on 2026-04-28 that it used QLoRA adaptation and did not use a reward model.", "evidence": [], "id": "train_001698", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-05-10 recorded macro F1 for Aster-3B on RiverBench-3 using Node Willow-05 on 2026-06-06.", "evidence": [{"doc_id": "doc_000280", "sent_id": 4}, {"doc_id": "doc_000388", "sent_id": 6}], "id": "train_001699", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from reward reranking to data mixing on 2026-05-24.", "evidence": [{"doc_id": "doc_000177", "sent_id": 4}], "id": "train_001700", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron's run with Vela-8B failed on Node Fir-10 because of a missing-index error on 2026-05-07.", "evidence": [{"doc_id": "doc_000489", "sent_id": 3}], "id": "train_001701", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Talia Marin was assigned as the lead for Project Aster on 2026-06-03.", "evidence": [], "id": "train_001702", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor's run with Lumen-7B failed because of a missing-index error on 2026-05-14 while using Node Poplar-12.", "evidence": [{"doc_id": "doc_000492", "sent_id": 3}], "id": "train_001703", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Mara Lane was assigned as the retrieval owner on 2026-06-08 selected Aster-7B for evidence retrieval on 2026-05-31.", "evidence": [{"doc_id": "doc_000204", "sent_id": 2}, {"doc_id": "doc_000409", "sent_id": 4}], "id": "train_001704", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from threshold search to reward reranking on 2026-06-07.", "evidence": [{"doc_id": "doc_000481", "sent_id": 4}], "id": "train_001705", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Samir Ames was assigned as the lead on 2026-05-18 received 3 GPUs from Node Maple-01 on 2026-04-18.", "evidence": [{"doc_id": "doc_000256", "sent_id": 2}, {"doc_id": "doc_000058", "sent_id": 6}], "id": "train_001706", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Orchid-3B achieved a higher evidence F1 than Finch-7B.", "evidence": [{"doc_id": "doc_000227", "sent_id": 6}, {"doc_id": "doc_000287", "sent_id": 2}], "id": "train_001707", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron changed its method from alias expansion to dense retrieval on 2026-06-26.", "evidence": [], "id": "train_001708", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-05-12 selected Nova-3B for reranking on 2026-06-18.", "evidence": [{"doc_id": "doc_000165", "sent_id": 7}, {"doc_id": "doc_000193", "sent_id": 4}], "id": "train_001709", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from QLoRA adaptation to hybrid retrieval on 2026-05-29.", "evidence": [], "id": "train_001710", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster had a failed run with Mira-7B on Node Juniper-06 because of an unstable-validation-loss error on 2026-06-19.", "evidence": [{"doc_id": "doc_000007", "sent_id": 8}], "id": "train_001711", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Owen Torres was assigned as the evaluation owner for Project Saffron on 2026-04-22.", "evidence": [], "id": "train_001712", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Marble-7B achieved a higher macro F1 than Helix-7B.", "evidence": [{"doc_id": "doc_000309", "sent_id": 7}, {"doc_id": "doc_000069", "sent_id": 7}], "id": "train_001713", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor moved the Milestone H1 deadline from 2026-05-20 to 2026-05-30 on 2026-05-07.", "evidence": [], "id": "train_001714", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Orchid-3B achieved 0.860 evidence F1 on LabQA-3 for Project Aster on 2026-05-23.", "evidence": [{"doc_id": "doc_000071", "sent_id": 6}], "id": "train_001715", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-05-26 received 3 GPUs from Node Elm-08 on 2026-04-04.", "evidence": [{"doc_id": "doc_000241", "sent_id": 7}, {"doc_id": "doc_000150", "sent_id": 6}], "id": "train_001716", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor moved the Milestone Z1 deadline from 2026-05-14 to 2026-05-22 on 2026-05-06.", "evidence": [{"doc_id": "doc_000412", "sent_id": 5}], "id": "train_001717", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Atlas-7B for calibration on 2026-05-31.", "evidence": [{"doc_id": "doc_000413", "sent_id": 3}], "id": "train_001718", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone L1 deadline to 2026-06-07 on 2026-05-09.", "evidence": [], "id": "train_001719", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor moved the Milestone T1 deadline from 2026-04-27 to 2026-05-03 on 2026-04-18.", "evidence": [], "id": "train_001720", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron moved the Milestone R1 deadline from 2026-05-02 to 2026-05-06 on 2026-04-08.", "evidence": [{"doc_id": "doc_000246", "sent_id": 5}], "id": "train_001721", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mara Lane was assigned as the retrieval owner for Project Sonata on 2026-04-15.", "evidence": [], "id": "train_001722", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-05-05 changed its method from hard-negative mining to metric smoothing on 2026-04-09.", "evidence": [{"doc_id": "doc_000368", "sent_id": 7}, {"doc_id": "doc_000464", "sent_id": 3}], "id": "train_001723", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Aster-8B achieved 0.544 evidence F1 on RiverBench-3 for Project Sonata on 2026-04-21.", "evidence": [], "id": "train_001724", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Lattice Memory Study 3 reported on 2026-05-04 that it used hard-negative mining and used a reward model.", "evidence": [{"doc_id": "doc_000087", "sent_id": 1}], "id": "train_001725", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nimbus-3B achieved 0.809 evidence F1 on TraceEval-2 for Project Nereid on 2026-05-11.", "evidence": [{"doc_id": "doc_000232", "sent_id": 2}], "id": "train_001726", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Julian Gray was assigned as the lead for Project Nereid on 2026-05-04.", "evidence": [{"doc_id": "doc_000401", "sent_id": 2}], "id": "train_001727", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vera Kim was assigned as the lead for Project Nereid on 2026-06-09.", "evidence": [{"doc_id": "doc_000262", "sent_id": 7}], "id": "train_001728", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Maple-01 allocated 3 GPUs to Project Sonata on 2026-05-10.", "evidence": [], "id": "train_001729", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian's run with Atlas-7B failed because of a checkpoint-mismatch error on 2026-05-28 while using Node Juniper-06.", "evidence": [{"doc_id": "doc_000128", "sent_id": 3}], "id": "train_001730", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Petra Gray was assigned as the evaluation owner for Project Saffron on 2026-04-21.", "evidence": [{"doc_id": "doc_000019", "sent_id": 7}], "id": "train_001731", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Dr. Arun Kwan was assigned as the data steward for Project Anchor on 2026-06-17.", "evidence": [], "id": "train_001732", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Amber Ranking Study 4 reported on 2026-06-08 that it used LoRA adaptation and used a reward model.", "evidence": [{"doc_id": "doc_000494", "sent_id": 2}], "id": "train_001733", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Aster's run with Orchid-3B failed on Node Cedar-02 because of a missing-index error on 2026-04-23.", "evidence": [{"doc_id": "doc_000499", "sent_id": 3}], "id": "train_001734", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Aster-7B for evidence retrieval on 2026-04-16.", "evidence": [{"doc_id": "doc_000073", "sent_id": 3}], "id": "train_001735", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from hard-negative mining to LoRA adaptation on 2026-04-12.", "evidence": [{"doc_id": "doc_000114", "sent_id": 4}], "id": "train_001736", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared evidence F1 runs, Lumen-7B achieved a higher evidence F1 than River-7B.", "evidence": [{"doc_id": "doc_000196", "sent_id": 7}, {"doc_id": "doc_000493", "sent_id": 2}], "id": "train_001737", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Nova-7B achieved 0.761 evidence F1 on LabQA for Project Nereid on 2026-05-18.", "evidence": [{"doc_id": "doc_000456", "sent_id": 2}], "id": "train_001738", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster moved the Milestone T1 deadline to 2026-06-09 on 2026-05-30.", "evidence": [], "id": "train_001739", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid changed its method from calibrated voting to confidence calibration on 2026-05-17.", "evidence": [{"doc_id": "doc_000071", "sent_id": 4}], "id": "train_001740", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Sofia Nadir was assigned as the retrieval owner on 2026-06-02 selected Mira-8B for calibration on 2026-04-14.", "evidence": [{"doc_id": "doc_000417", "sent_id": 7}, {"doc_id": "doc_000235", "sent_id": 7}], "id": "train_001741", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Cedar-7B for error analysis on 2026-04-10.", "evidence": [{"doc_id": "doc_000103", "sent_id": 10}], "id": "train_001742", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Sofia Nadir was assigned as the retrieval owner on 2026-06-02 changed its method from metric smoothing to document chunking on 2026-04-15.", "evidence": [{"doc_id": "doc_000417", "sent_id": 7}, {"doc_id": "doc_000455", "sent_id": 5}], "id": "train_001743", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from cross-encoder reranking to late interaction on 2026-05-10.", "evidence": [{"doc_id": "doc_000316", "sent_id": 5}], "id": "train_001744", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Aster's run with Helix-8B failed on Node Poplar-12 because of an unstable-validation-loss error on 2026-04-02.", "evidence": [{"doc_id": "doc_000146", "sent_id": 3}], "id": "train_001745", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Pine-07 allocated 4 GPUs to Project Saffron on 2026-06-26.", "evidence": [], "id": "train_001746", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Birch-04 allocated 4 GPUs to Project Meridian on 2026-06-09.", "evidence": [], "id": "train_001747", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-04-27 had a failed run with Kestrel-3B on Node Birch-04 because of an unstable-validation-loss error on 2026-06-19.", "evidence": [{"doc_id": "doc_000394", "sent_id": 2}, {"doc_id": "doc_000072", "sent_id": 8}], "id": "train_001748", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Nested Verifier Study 4 reported on 2026-04-25 that it used rank fusion and did not use a reward model.", "evidence": [], "id": "train_001749", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Sycamore-13 allocated 3 GPUs to Project Sonata on 2026-04-21.", "evidence": [], "id": "train_001750", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Cedar-7B achieved 0.857 macro F1 on NereidNotes-3 for Project Nereid on 2026-04-21.", "evidence": [{"doc_id": "doc_000455", "sent_id": 7}], "id": "train_001751", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron had a failed run with Lumen-3B on Node Sycamore-13 because of a missing-index error on 2026-04-23.", "evidence": [{"doc_id": "doc_000387", "sent_id": 3}], "id": "train_001752", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian selected Atlas-7B for error analysis on 2026-05-31.", "evidence": [{"doc_id": "doc_000207", "sent_id": 4}], "id": "train_001753", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron's run with Quartz-7B failed because of a missing-index error on 2026-05-21 while using Node Juniper-06.", "evidence": [{"doc_id": "doc_000456", "sent_id": 3}], "id": "train_001754", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared accuracy runs, Nimbus-8B achieved a higher accuracy than Aster-8B.", "evidence": [{"doc_id": "doc_000289", "sent_id": 6}, {"doc_id": "doc_000437", "sent_id": 6}], "id": "train_001755", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Dr. Noah Vale was assigned as the lead for Project Aster on 2026-04-07.", "evidence": [{"doc_id": "doc_000103", "sent_id": 9}], "id": "train_001756", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata selected Vela-8B for reranking on 2026-04-23.", "evidence": [{"doc_id": "doc_000390", "sent_id": 3}], "id": "train_001757", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Maple-01 allocated 3 GPUs to Project Sonata on 2026-05-16.", "evidence": [{"doc_id": "doc_000489", "sent_id": 6}], "id": "train_001758", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mina Adler was assigned as the retrieval owner for Project Aster on 2026-05-18.", "evidence": [{"doc_id": "doc_000290", "sent_id": 2}], "id": "train_001759", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Anika Costa was assigned as the retrieval owner for Project Aster on 2026-05-25.", "evidence": [{"doc_id": "doc_000375", "sent_id": 2}], "id": "train_001760", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from alias expansion to temporal filtering on 2026-04-08.", "evidence": [{"doc_id": "doc_000373", "sent_id": 5}], "id": "train_001761", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-05-05 received 2 GPUs from Node Aspen-01 on 2026-05-07.", "evidence": [{"doc_id": "doc_000346", "sent_id": 7}, {"doc_id": "doc_000280", "sent_id": 3}], "id": "train_001762", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian's run with Atlas-8B failed because of a missing-index error on 2026-05-15 while using Node Hazel-14.", "evidence": [{"doc_id": "doc_000252", "sent_id": 8}], "id": "train_001763", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Helix-7B for claim classification on 2026-05-09.", "evidence": [], "id": "train_001764", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian changed its method from data mixing to contrastive tuning on 2026-04-13.", "evidence": [], "id": "train_001765", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared evidence F1 runs, Atlas-3B achieved a higher evidence F1 than Nimbus-8B.", "evidence": [{"doc_id": "doc_000120", "sent_id": 2}, {"doc_id": "doc_000189", "sent_id": 7}], "id": "train_001766", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid changed its method from sentence pruning to cross-encoder reranking on 2026-06-28.", "evidence": [{"doc_id": "doc_000142", "sent_id": 4}], "id": "train_001767", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared accuracy runs, Finch-8B achieved a higher accuracy than Nimbus-7B.", "evidence": [{"doc_id": "doc_000215", "sent_id": 7}, {"doc_id": "doc_000040", "sent_id": 2}], "id": "train_001768", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-05-11 received 4 GPUs from Node Maple-01 on 2026-05-30.", "evidence": [{"doc_id": "doc_000241", "sent_id": 2}, {"doc_id": "doc_000266", "sent_id": 6}], "id": "train_001769", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Julian Gray was assigned as the lead on 2026-06-02 moved the Milestone D1 deadline to 2026-05-22 on 2026-04-30.", "evidence": [{"doc_id": "doc_000212", "sent_id": 3}, {"doc_id": "doc_000166", "sent_id": 3}], "id": "train_001770", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Quartz-3B for reranking on 2026-04-26.", "evidence": [{"doc_id": "doc_000387", "sent_id": 4}], "id": "train_001771", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata changed its method from hybrid retrieval to QLoRA adaptation on 2026-05-20.", "evidence": [{"doc_id": "doc_000213", "sent_id": 5}], "id": "train_001772", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster selected Marble-3B for evidence retrieval on 2026-05-24.", "evidence": [{"doc_id": "doc_000456", "sent_id": 4}], "id": "train_001773", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid changed its method from LoRA adaptation to alias expansion on 2026-06-07.", "evidence": [{"doc_id": "doc_000037", "sent_id": 4}], "id": "train_001774", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor selected Orchid-7B for reranking on 2026-04-13.", "evidence": [], "id": "train_001775", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Laurel-11 allocated 1 GPU to Project Nereid on 2026-06-14.", "evidence": [], "id": "train_001776", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata selected Quartz-7B for claim classification on 2026-05-22.", "evidence": [{"doc_id": "doc_000284", "sent_id": 8}], "id": "train_001777", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Camila Quinn was assigned as the evaluation owner on 2026-04-27 received 2 GPUs from Node Aspen-01 on 2026-04-18.", "evidence": [{"doc_id": "doc_000151", "sent_id": 2}, {"doc_id": "doc_000074", "sent_id": 5}], "id": "train_001778", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Juniper-06 allocated 4 GPUs to Project Anchor on 2026-05-23.", "evidence": [], "id": "train_001779", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Juniper-06 allocated 1 GPU to Project Aster on 2026-04-08.", "evidence": [], "id": "train_001780", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-05-12 received 3 GPUs from Node Spruce-03 on 2026-05-22.", "evidence": [{"doc_id": "doc_000151", "sent_id": 7}, {"doc_id": "doc_000100", "sent_id": 8}], "id": "train_001781", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Atlas-7B achieved 0.815 evidence F1 on RiverBench-2 for Project Nereid on 2026-06-23.", "evidence": [{"doc_id": "doc_000261", "sent_id": 7}], "id": "train_001782", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Spruce-03 allocated 1 GPU to Project Aster on 2026-06-28.", "evidence": [], "id": "train_001783", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-04-21 selected Finch-7B for error analysis on 2026-05-28.", "evidence": [{"doc_id": "doc_000114", "sent_id": 7}, {"doc_id": "doc_000384", "sent_id": 4}], "id": "train_001784", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Pale Compass Study 4 reported on 2026-04-30 that it used reward reranking and did not use a reward model.", "evidence": [], "id": "train_001785", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared latency efficiency score runs, Lumen-3B achieved a higher latency efficiency score than Helix-3B.", "evidence": [{"doc_id": "doc_000054", "sent_id": 2}, {"doc_id": "doc_000032", "sent_id": 7}], "id": "train_001786", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster moved the Milestone B1 deadline to 2026-06-03 on 2026-05-27.", "evidence": [], "id": "train_001787", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from metric smoothing to hard-negative mining on 2026-05-31.", "evidence": [{"doc_id": "doc_000388", "sent_id": 4}], "id": "train_001788", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone X1 deadline to 2026-05-05 on 2026-04-29.", "evidence": [{"doc_id": "doc_000238", "sent_id": 5}], "id": "train_001789", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Dr. Ravi Reed was assigned as the evaluation owner on 2026-06-09 selected Atlas-7B for evidence retrieval on 2026-06-12.", "evidence": [{"doc_id": "doc_000384", "sent_id": 9}, {"doc_id": "doc_000116", "sent_id": 8}], "id": "train_001790", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-04-19 had a run with Cedar-8B that failed because of a missing-index error on 2026-05-22 while using Node Laurel-11.", "evidence": [{"doc_id": "doc_000016", "sent_id": 4}, {"doc_id": "doc_000208", "sent_id": 8}], "id": "train_001791", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Mira-3B achieved 0.613 latency efficiency score on MemoTrace-2 for Project Saffron on 2026-05-23.", "evidence": [{"doc_id": "doc_000214", "sent_id": 6}], "id": "train_001792", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster changed its method from LoRA adaptation to alias expansion on 2026-04-26.", "evidence": [{"doc_id": "doc_000298", "sent_id": 4}], "id": "train_001793", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Orchid-3B for evidence retrieval on 2026-05-17.", "evidence": [{"doc_id": "doc_000213", "sent_id": 4}], "id": "train_001794", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira-3B achieved 0.568 evidence F1 on TraceEval-3 for Project Sonata on 2026-06-27.", "evidence": [{"doc_id": "doc_000419", "sent_id": 7}], "id": "train_001795", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Owen Torres was assigned as the evaluation owner on 2026-06-16 recorded accuracy for Mira-3B on MemoTrace-2 using Node Sycamore-13 on 2026-04-20.", "evidence": [{"doc_id": "doc_000463", "sent_id": 9}, {"doc_id": "doc_000499", "sent_id": 2}], "id": "train_001796", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Lumen-3B achieved 0.949 macro F1 on TraceEval for Project Aster on 2026-04-13.", "evidence": [{"doc_id": "doc_000183", "sent_id": 1}], "id": "train_001797", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Rohan Singh was assigned as the retrieval owner on 2026-04-13 received 3 GPUs from Node Willow-05 on 2026-06-17.", "evidence": [{"doc_id": "doc_000234", "sent_id": 2}, {"doc_id": "doc_000345", "sent_id": 5}], "id": "train_001798", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Ravi Nadir was assigned as the data steward on 2026-06-22 received 6 GPUs from Node Laurel-11 on 2026-04-22.", "evidence": [{"doc_id": "doc_000103", "sent_id": 2}, {"doc_id": "doc_000314", "sent_id": 5}], "id": "train_001799", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata changed its method from metric smoothing to hard-negative mining on 2026-04-19.", "evidence": [{"doc_id": "doc_000402", "sent_id": 4}], "id": "train_001800", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Rohan Singh was assigned as the retrieval owner on 2026-04-13 recorded macro F1 for Aster-7B on RiverBench-3 using Node Willow-05 on 2026-04-27.", "evidence": [{"doc_id": "doc_000234", "sent_id": 2}, {"doc_id": "doc_000276", "sent_id": 2}], "id": "train_001801", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Orchid-8B achieved a higher evidence F1 than Cedar-8B.", "evidence": [{"doc_id": "doc_000015", "sent_id": 6}, {"doc_id": "doc_000133", "sent_id": 6}], "id": "train_001802", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Dr. Arun Bauer was assigned as the evaluation owner on 2026-05-11 selected Lumen-7B for calibration on 2026-06-25.", "evidence": [{"doc_id": "doc_000138", "sent_id": 2}, {"doc_id": "doc_000142", "sent_id": 3}], "id": "train_001803", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata's run with Vela-7B failed on Node Fir-10 because of a missing-index error on 2026-04-03.", "evidence": [{"doc_id": "doc_000108", "sent_id": 8}], "id": "train_001804", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Kestrel-3B for claim classification on 2026-04-11.", "evidence": [], "id": "train_001805", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid selected Kestrel-3B for claim classification on 2026-04-10.", "evidence": [{"doc_id": "doc_000182", "sent_id": 8}], "id": "train_001806", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Quartz-8B achieved 0.678 accuracy on CedarQA-2 for Project Saffron on 2026-06-01.", "evidence": [{"doc_id": "doc_000430", "sent_id": 2}], "id": "train_001807", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian's run with Nimbus-3B failed because of a checkpoint-mismatch error on 2026-04-04 while using Node Elm-08.", "evidence": [], "id": "train_001808", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-05-24 recorded latency efficiency score for Nova-7B on LumenFacts-2 using Node Elm-08 on 2026-04-13.", "evidence": [{"doc_id": "doc_000226", "sent_id": 4}, {"doc_id": "doc_000398", "sent_id": 2}], "id": "train_001809", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron had a failed run with Aster-8B on Node Maple-01 because of a missing-index error on 2026-06-26.", "evidence": [{"doc_id": "doc_000261", "sent_id": 8}], "id": "train_001810", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor had a failed run with Kestrel-3B on Node Spruce-03 because of a checkpoint-mismatch error on 2026-05-15.", "evidence": [{"doc_id": "doc_000137", "sent_id": 8}], "id": "train_001811", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Quiet Retriever Study 3 reported on 2026-04-25 that it used evidence pooling and did not use a reward model.", "evidence": [], "id": "train_001812", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata moved the Milestone F1 deadline to 2026-05-27 on 2026-05-06.", "evidence": [{"doc_id": "doc_000165", "sent_id": 5}], "id": "train_001813", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-06-09 recorded latency efficiency score for Helix-8B on CedarQA-3 using Node Poplar-12 on 2026-05-10.", "evidence": [{"doc_id": "doc_000388", "sent_id": 7}, {"doc_id": "doc_000168", "sent_id": 4}], "id": "train_001814", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor moved the Milestone N1 deadline from 2026-04-30 to 2026-05-06 on 2026-04-09.", "evidence": [], "id": "train_001815", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid selected Atlas-7B for calibration on 2026-05-21.", "evidence": [{"doc_id": "doc_000417", "sent_id": 3}], "id": "train_001816", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Laurel-11 allocated 4 GPUs to Project Meridian on 2026-05-31.", "evidence": [], "id": "train_001817", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-05-18 changed its method from metric smoothing to document chunking on 2026-06-24.", "evidence": [{"doc_id": "doc_000417", "sent_id": 2}, {"doc_id": "doc_000027", "sent_id": 5}], "id": "train_001818", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster had a failed run with Orchid-7B on Node Rowan-09 because of an unstable-validation-loss error on 2026-05-21.", "evidence": [{"doc_id": "doc_000363", "sent_id": 3}], "id": "train_001819", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron changed its method from structured prompting to chain verification on 2026-06-03.", "evidence": [{"doc_id": "doc_000140", "sent_id": 5}], "id": "train_001820", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Quartz-8B for error analysis on 2026-05-10.", "evidence": [{"doc_id": "doc_000086", "sent_id": 4}], "id": "train_001821", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Lumen-8B for calibration on 2026-05-08.", "evidence": [{"doc_id": "doc_000285", "sent_id": 8}], "id": "train_001822", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor changed its method from alias expansion to LoRA adaptation on 2026-06-03.", "evidence": [{"doc_id": "doc_000128", "sent_id": 5}], "id": "train_001823", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Dr. Noah Vale was assigned as the lead for Project Aster on 2026-06-24.", "evidence": [], "id": "train_001824", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster changed its method from dense retrieval to rank fusion on 2026-04-30.", "evidence": [], "id": "train_001825", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from BM25 retrieval to evidence pooling on 2026-05-01.", "evidence": [], "id": "train_001826", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Kira Frost was assigned as the data steward for Project Anchor on 2026-04-15.", "evidence": [], "id": "train_001827", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Nested Verifier Study 2 reported on 2026-05-26 that it used hard-negative mining and did not use a reward model.", "evidence": [], "id": "train_001828", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from rank fusion to teacher distillation on 2026-04-20.", "evidence": [], "id": "train_001829", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Evan Iyer was assigned as the retrieval owner on 2026-04-28 moved the Milestone B2 deadline to 2026-06-06 on 2026-05-28.", "evidence": [{"doc_id": "doc_000039", "sent_id": 7}, {"doc_id": "doc_000160", "sent_id": 3}], "id": "train_001830", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Quartz-8B achieved 0.708 accuracy on CedarQA-2 for Project Saffron on 2026-05-26.", "evidence": [{"doc_id": "doc_000337", "sent_id": 7}], "id": "train_001831", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Leo Park was assigned as the evaluation owner on 2026-05-12 moved the Milestone T1 deadline from 2026-04-30 to 2026-05-08 on 2026-04-10.", "evidence": [{"doc_id": "doc_000394", "sent_id": 7}, {"doc_id": "doc_000420", "sent_id": 8}], "id": "train_001832", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Aster-8B achieved 0.724 evidence F1 on RiverBench-3 for Project Sonata on 2026-06-15.", "evidence": [], "id": "train_001833", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Iris Lane was assigned as the evaluation owner on 2026-05-04 recorded accuracy for Lumen-7B on MemoTrace-3 using Node Spruce-03 on 2026-04-05.", "evidence": [{"doc_id": "doc_000316", "sent_id": 2}, {"doc_id": "doc_000243", "sent_id": 4}], "id": "train_001834", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Birch-04 allocated 2 GPUs to Project Meridian on 2026-06-18.", "evidence": [], "id": "train_001835", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata changed its method from temporal filtering to alias expansion on 2026-05-24.", "evidence": [{"doc_id": "doc_000476", "sent_id": 5}], "id": "train_001836", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Vera Torres was assigned as the retrieval owner on 2026-06-02 selected Atlas-3B for error analysis on 2026-04-19.", "evidence": [{"doc_id": "doc_000403", "sent_id": 7}, {"doc_id": "doc_000264", "sent_id": 4}], "id": "train_001837", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata moved the Milestone F1 deadline to 2026-05-26 on 2026-05-07.", "evidence": [], "id": "train_001838", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Cedar-8B for error analysis on 2026-05-14.", "evidence": [{"doc_id": "doc_000138", "sent_id": 3}], "id": "train_001839", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from teacher distillation to query rewriting on 2026-05-25.", "evidence": [], "id": "train_001840", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Evan Iyer was assigned as the retrieval owner on 2026-04-28 moved the Milestone V1 deadline to 2026-06-05 on 2026-05-25.", "evidence": [{"doc_id": "doc_000039", "sent_id": 7}, {"doc_id": "doc_000270", "sent_id": 2}], "id": "train_001841", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Rohan Singh was assigned as the retrieval owner on 2026-04-28 had a failed run with Mira-8B on Node Sycamore-13 because of an out-of-memory error on 2026-05-29.", "evidence": [{"doc_id": "doc_000159", "sent_id": 6}, {"doc_id": "doc_000492", "sent_id": 8}], "id": "train_001842", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid moved the Milestone V1 deadline to 2026-06-27 on 2026-05-31.", "evidence": [], "id": "train_001843", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Dr. Noah Chen was assigned as the retrieval owner on 2026-04-06 received 5 GPUs from Node Rowan-09 on 2026-05-09.", "evidence": [{"doc_id": "doc_000271", "sent_id": 2}, {"doc_id": "doc_000129", "sent_id": 6}], "id": "train_001844", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Vela-8B achieved 0.636 latency efficiency score on LumenFacts-3 for Project Saffron on 2026-05-04.", "evidence": [{"doc_id": "doc_000450", "sent_id": 2}], "id": "train_001845", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Quiet Retriever Study 3 reported on 2026-04-24 that it used evidence pooling and used a reward model.", "evidence": [{"doc_id": "doc_000464", "sent_id": 8}], "id": "train_001846", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Kestrel-3B achieved 0.863 evidence F1 on OrionBench-3 for Project Nereid on 2026-06-16.", "evidence": [{"doc_id": "doc_000430", "sent_id": 7}], "id": "train_001847", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian changed its method from query rewriting to sentence pruning on 2026-05-24.", "evidence": [{"doc_id": "doc_000403", "sent_id": 4}], "id": "train_001848", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Helix-7B achieved 0.617 accuracy on CedarQA-3 for Project Anchor on 2026-05-18.", "evidence": [], "id": "train_001849", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster selected Atlas-7B for reranking on 2026-05-03.", "evidence": [{"doc_id": "doc_000215", "sent_id": 4}], "id": "train_001850", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Vera Torres was assigned as the retrieval owner on 2026-06-02 selected Kestrel-8B for claim classification on 2026-04-23.", "evidence": [{"doc_id": "doc_000403", "sent_id": 7}, {"doc_id": "doc_000036", "sent_id": 4}], "id": "train_001851", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Quartz-8B for claim classification on 2026-05-21.", "evidence": [{"doc_id": "doc_000177", "sent_id": 3}], "id": "train_001852", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from hybrid retrieval to document chunking on 2026-05-10.", "evidence": [{"doc_id": "doc_000117", "sent_id": 4}], "id": "train_001853", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata changed its method from evidence pooling to calibrated voting on 2026-05-03.", "evidence": [{"doc_id": "doc_000412", "sent_id": 4}], "id": "train_001854", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone L1 deadline to 2026-05-15 on 2026-04-16.", "evidence": [], "id": "train_001855", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-05-11 selected Nova-3B for reranking on 2026-05-12.", "evidence": [{"doc_id": "doc_000319", "sent_id": 2}, {"doc_id": "doc_000258", "sent_id": 7}], "id": "train_001856", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Aster-3B for evidence retrieval on 2026-04-02.", "evidence": [{"doc_id": "doc_000374", "sent_id": 3}], "id": "train_001857", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Evan Iyer was assigned as the retrieval owner on 2026-04-28 changed its method from QLoRA adaptation to hybrid retrieval on 2026-05-09.", "evidence": [{"doc_id": "doc_000039", "sent_id": 7}, {"doc_id": "doc_000277", "sent_id": 6}], "id": "train_001858", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Ravi Nadir was assigned as the data steward on 2026-06-22 recorded accuracy for Cedar-8B on VestaLogs-2 using Node Laurel-11 on 2026-04-19.", "evidence": [{"doc_id": "doc_000103", "sent_id": 2}, {"doc_id": "doc_000339", "sent_id": 4}], "id": "train_001859", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-05-26 selected Mira-3B for calibration on 2026-04-09.", "evidence": [{"doc_id": "doc_000138", "sent_id": 7}, {"doc_id": "doc_000437", "sent_id": 3}], "id": "train_001860", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Owen Marin was assigned as the data steward on 2026-06-15 recorded latency efficiency score for Aster-8B on SignalSet using Node Willow-05 on 2026-04-28.", "evidence": [{"doc_id": "doc_000288", "sent_id": 2}, {"doc_id": "doc_000222", "sent_id": 7}], "id": "train_001861", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Stable Chains Study 2 reported on 2026-05-19 that it used BM25 retrieval and did not use a reward model.", "evidence": [], "id": "train_001862", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Mira-3B achieved 0.729 evidence F1 on TraceEval-3 for Project Sonata on 2026-06-09.", "evidence": [{"doc_id": "doc_000486", "sent_id": 7}], "id": "train_001863", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Mira-7B for calibration on 2026-05-22.", "evidence": [{"doc_id": "doc_000162", "sent_id": 8}], "id": "train_001864", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from confidence calibration to chain verification on 2026-04-05.", "evidence": [{"doc_id": "doc_000255", "sent_id": 4}], "id": "train_001865", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Ravi Nadir was assigned as the data steward for Project Meridian on 2026-04-22.", "evidence": [], "id": "train_001866", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Hazel-14 allocated 4 GPUs to Project Meridian on 2026-04-02.", "evidence": [], "id": "train_001867", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Quartz-3B achieved 0.817 latency efficiency score on CedarQA-2 for Project Saffron on 2026-04-08.", "evidence": [], "id": "train_001868", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mara Lane was assigned as the retrieval owner on 2026-04-20 received 3 GPUs from Node Willow-05 on 2026-06-10.", "evidence": [{"doc_id": "doc_000346", "sent_id": 2}, {"doc_id": "doc_000421", "sent_id": 5}], "id": "train_001869", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from hard-negative mining to LoRA adaptation on 2026-06-21.", "evidence": [{"doc_id": "doc_000231", "sent_id": 7}], "id": "train_001870", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared evidence F1 runs, Aster-3B achieved a higher evidence F1 than Helix-3B.", "evidence": [{"doc_id": "doc_000077", "sent_id": 6}, {"doc_id": "doc_000273", "sent_id": 4}], "id": "train_001871", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian selected Kestrel-7B for claim classification on 2026-06-04.", "evidence": [{"doc_id": "doc_000463", "sent_id": 3}], "id": "train_001872", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected Atlas-7B for error analysis on 2026-06-21.", "evidence": [{"doc_id": "doc_000089", "sent_id": 3}], "id": "train_001873", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Quartz-8B achieved a higher evidence F1 than Cedar-3B.", "evidence": [{"doc_id": "doc_000395", "sent_id": 6}, {"doc_id": "doc_000426", "sent_id": 6}], "id": "train_001874", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared accuracy runs, Atlas-8B achieved a higher accuracy than Vela-7B.", "evidence": [{"doc_id": "doc_000417", "sent_id": 6}, {"doc_id": "doc_000319", "sent_id": 6}], "id": "train_001875", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-06-15 moved the Milestone X1 deadline to 2026-05-28 on 2026-05-06.", "evidence": [{"doc_id": "doc_000465", "sent_id": 2}, {"doc_id": "doc_000307", "sent_id": 5}], "id": "train_001876", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian's run with Nimbus-8B failed because of a checkpoint-mismatch error on 2026-06-25 while using Node Elm-08.", "evidence": [{"doc_id": "doc_000253", "sent_id": 3}], "id": "train_001877", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor changed its method from confidence calibration to chain verification on 2026-06-19.", "evidence": [], "id": "train_001878", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron's run with Aster-3B failed because of a missing-index error on 2026-06-05 while using Node Hazel-14.", "evidence": [{"doc_id": "doc_000363", "sent_id": 8}], "id": "train_001879", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron changed its method from QLoRA adaptation to structured prompting on 2026-05-24.", "evidence": [{"doc_id": "doc_000417", "sent_id": 4}], "id": "train_001880", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Talia Marin was assigned as the lead for Project Aster on 2026-06-16.", "evidence": [{"doc_id": "doc_000169", "sent_id": 7}], "id": "train_001881", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Nimbus-3B for calibration on 2026-06-01.", "evidence": [], "id": "train_001882", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Delta Evidence Study 4 reported on 2026-05-29 that it used BM25 retrieval and did not use a reward model.", "evidence": [{"doc_id": "doc_000107", "sent_id": 8}], "id": "train_001883", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-04-20 had a run with Marble-3B that failed because of a missing-index error on 2026-05-14 while using Node Maple-01.", "evidence": [{"doc_id": "doc_000260", "sent_id": 2}, {"doc_id": "doc_000321", "sent_id": 3}], "id": "train_001884", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster changed its method from query rewriting to teacher distillation on 2026-04-08.", "evidence": [{"doc_id": "doc_000484", "sent_id": 5}], "id": "train_001885", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid's run with Kestrel-3B failed because of an out-of-memory error on 2026-06-20 while using Node Birch-04.", "evidence": [], "id": "train_001886", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Cedar-7B achieved a higher evidence F1 than River-7B.", "evidence": [{"doc_id": "doc_000302", "sent_id": 2}, {"doc_id": "doc_000208", "sent_id": 7}], "id": "train_001887", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Priya Moss was assigned as the data steward on 2026-05-11 selected Quartz-7B for claim classification on 2026-06-24.", "evidence": [{"doc_id": "doc_000090", "sent_id": 2}, {"doc_id": "doc_000340", "sent_id": 4}], "id": "train_001888", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata's run with Aster-8B failed on Node Willow-05 because of a checkpoint-mismatch error on 2026-05-08.", "evidence": [{"doc_id": "doc_000257", "sent_id": 8}], "id": "train_001889", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Maple-01 allocated 5 GPUs to Project Sonata on 2026-05-23.", "evidence": [{"doc_id": "doc_000232", "sent_id": 6}], "id": "train_001890", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Selene Rios was assigned as the data steward on 2026-04-07 had a run with Helix-7B on Node Poplar-12 that failed because of an unstable-validation-loss error on 2026-04-10.", "evidence": [{"doc_id": "doc_000365", "sent_id": 7}, {"doc_id": "doc_000393", "sent_id": 8}], "id": "train_001891", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-05-18 changed its method from calibrated voting to evidence pooling on 2026-04-18.", "evidence": [{"doc_id": "doc_000417", "sent_id": 2}, {"doc_id": "doc_000457", "sent_id": 6}], "id": "train_001892", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster changed its method from alias expansion to LoRA adaptation on 2026-04-01.", "evidence": [{"doc_id": "doc_000295", "sent_id": 5}], "id": "train_001893", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-08 changed its method from cross-encoder reranking to sentence pruning on 2026-04-19.", "evidence": [{"doc_id": "doc_000062", "sent_id": 2}, {"doc_id": "doc_000005", "sent_id": 4}], "id": "train_001894", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor selected Atlas-7B for evidence retrieval on 2026-04-30.", "evidence": [{"doc_id": "doc_000015", "sent_id": 3}], "id": "train_001895", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Amber Ranking Study 1 reported on 2026-05-14 that it used confidence calibration and did not use a reward model.", "evidence": [], "id": "train_001896", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Helix-3B achieved 0.816 latency efficiency score on CedarQA-3 for Project Anchor on 2026-06-23.", "evidence": [{"doc_id": "doc_000032", "sent_id": 7}], "id": "train_001897", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mina Torres was assigned as the retrieval owner for Project Sonata on 2026-04-27.", "evidence": [{"doc_id": "doc_000015", "sent_id": 2}], "id": "train_001898", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Finch-3B for error analysis on 2026-04-20.", "evidence": [], "id": "train_001899", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "River-8B achieved 0.940 accuracy on SignalSet-2 for Project Anchor on 2026-04-07.", "evidence": [{"doc_id": "doc_000318", "sent_id": 7}], "id": "train_001900", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Marble-7B achieved 0.751 accuracy on VestaLogs-3 for Project Saffron on 2026-05-25.", "evidence": [], "id": "train_001901", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Noah Chen was assigned as the retrieval owner for Project Aster on 2026-06-28.", "evidence": [], "id": "train_001902", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-01 changed its method from BM25 retrieval to late interaction on 2026-04-09.", "evidence": [{"doc_id": "doc_000093", "sent_id": 2}, {"doc_id": "doc_000324", "sent_id": 3}], "id": "train_001903", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Talia Reed was assigned as the retrieval owner for Project Aster on 2026-05-27.", "evidence": [], "id": "train_001904", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Theo Grant was assigned as the data steward for Project Saffron on 2026-04-27.", "evidence": [{"doc_id": "doc_000307", "sent_id": 2}], "id": "train_001905", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Selene Rios was assigned as the data steward on 2026-04-07 moved the Milestone H1 deadline from 2026-05-23 to 2026-05-27 on 2026-04-29.", "evidence": [{"doc_id": "doc_000365", "sent_id": 7}, {"doc_id": "doc_000133", "sent_id": 5}], "id": "train_001906", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster moved the Milestone Z1 deadline to 2026-06-19 on 2026-05-27.", "evidence": [{"doc_id": "doc_000417", "sent_id": 5}], "id": "train_001907", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Aster-7B achieved a higher evidence F1 than Finch-7B.", "evidence": [{"doc_id": "doc_000230", "sent_id": 7}, {"doc_id": "doc_000287", "sent_id": 2}], "id": "train_001908", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian selected Atlas-7B for error analysis on 2026-06-14.", "evidence": [{"doc_id": "doc_000135", "sent_id": 4}], "id": "train_001909", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-05-25 moved the Milestone V1 deadline from 2026-06-16 to 2026-06-10 on 2026-05-15.", "evidence": [{"doc_id": "doc_000351", "sent_id": 1}, {"doc_id": "doc_000013", "sent_id": 8}], "id": "train_001910", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from hard-negative mining to metric smoothing on 2026-04-15.", "evidence": [{"doc_id": "doc_000026", "sent_id": 5}], "id": "train_001911", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The review of Pale Compass Study 3 reported on 2026-05-09 that it used temporal filtering and did not use a reward model.", "evidence": [], "id": "train_001912", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Kestrel-8B for claim classification on 2026-05-28.", "evidence": [{"doc_id": "doc_000375", "sent_id": 3}], "id": "train_001913", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron moved the Milestone L1 deadline from 2026-05-30 to 2026-05-22 on 2026-05-06.", "evidence": [{"doc_id": "doc_000015", "sent_id": 5}], "id": "train_001914", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Aster-3B for evidence retrieval on 2026-04-17.", "evidence": [{"doc_id": "doc_000070", "sent_id": 8}], "id": "train_001915", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Lena Costa was assigned as the data steward for Project Saffron on 2026-06-02.", "evidence": [{"doc_id": "doc_000290", "sent_id": 7}], "id": "train_001916", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Nora Bauer was assigned as the lead for Project Nereid on 2026-05-25.", "evidence": [{"doc_id": "doc_000388", "sent_id": 2}], "id": "train_001917", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Nova-7B for reranking on 2026-06-27.", "evidence": [], "id": "train_001918", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Orchid-7B achieved 0.601 macro F1 on LabQA-3 for Project Aster on 2026-04-28.", "evidence": [{"doc_id": "doc_000398", "sent_id": 7}], "id": "train_001919", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Felix Brooks was assigned as the retrieval owner on 2026-04-13 selected Helix-8B for reranking on 2026-05-08.", "evidence": [{"doc_id": "doc_000402", "sent_id": 2}, {"doc_id": "doc_000133", "sent_id": 8}], "id": "train_001920", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Finch-3B achieved a higher evidence F1 than Nimbus-8B.", "evidence": [{"doc_id": "doc_000472", "sent_id": 7}, {"doc_id": "doc_000303", "sent_id": 2}], "id": "train_001921", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor selected Atlas-7B for calibration on 2026-05-17.", "evidence": [{"doc_id": "doc_000321", "sent_id": 4}], "id": "train_001922", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Saffron selected Atlas-7B for error analysis on 2026-06-28.", "evidence": [{"doc_id": "doc_000295", "sent_id": 4}], "id": "train_001923", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from threshold search to contrastive tuning on 2026-04-22.", "evidence": [{"doc_id": "doc_000183", "sent_id": 4}], "id": "train_001924", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Saffron changed its method from contrastive tuning to threshold search on 2026-04-05.", "evidence": [{"doc_id": "doc_000414", "sent_id": 5}], "id": "train_001925", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-04-07 received 3 GPUs from Node Poplar-12 on 2026-04-03.", "evidence": [{"doc_id": "doc_000103", "sent_id": 9}, {"doc_id": "doc_000348", "sent_id": 8}], "id": "train_001926", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from late interaction to cross-encoder reranking on 2026-06-03.", "evidence": [{"doc_id": "doc_000474", "sent_id": 5}], "id": "train_001927", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "River-8B achieved 0.869 latency efficiency score on SignalSet-2 for Project Anchor on 2026-05-16.", "evidence": [{"doc_id": "doc_000162", "sent_id": 6}], "id": "train_001928", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster's run with Lumen-3B failed on Node Spruce-03 because of an out-of-memory error on 2026-06-25.", "evidence": [{"doc_id": "doc_000251", "sent_id": 3}], "id": "train_001929", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-04-14 changed its method from cross-encoder reranking to late interaction on 2026-06-07.", "evidence": [{"doc_id": "doc_000209", "sent_id": 7}, {"doc_id": "doc_000044", "sent_id": 4}], "id": "train_001930", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Leo Park was assigned as the evaluation owner on 2026-06-23 changed its method from metric smoothing to document chunking on 2026-06-24.", "evidence": [{"doc_id": "doc_000062", "sent_id": 7}, {"doc_id": "doc_000027", "sent_id": 5}], "id": "train_001931", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Dr. Theo Lind was assigned as the evaluation owner for Project Saffron on 2026-04-06.", "evidence": [{"doc_id": "doc_000092", "sent_id": 2}], "id": "train_001932", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-23 received 5 GPUs from Node Fir-10 on 2026-05-23.", "evidence": [{"doc_id": "doc_000225", "sent_id": 7}, {"doc_id": "doc_000381", "sent_id": 6}], "id": "train_001933", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid had a failed run with Helix-8B on Node Aspen-01 because of an out-of-memory error on 2026-05-29.", "evidence": [{"doc_id": "doc_000232", "sent_id": 8}], "id": "train_001934", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-04-20 changed its method from data mixing to reward reranking on 2026-05-27.", "evidence": [{"doc_id": "doc_000133", "sent_id": 2}, {"doc_id": "doc_000054", "sent_id": 5}], "id": "train_001935", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Sofia Nadir was assigned as the retrieval owner for Project Sonata on 2026-06-03.", "evidence": [], "id": "train_001936", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Quiet Retriever Study 2 reported on 2026-06-02 that it used metric smoothing and did not use a reward model.", "evidence": [], "id": "train_001937", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Pine-07 allocated 4 GPUs to Project Saffron on 2026-06-07.", "evidence": [], "id": "train_001938", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata had a failed run with Quartz-8B on Node Pine-07 because of an unstable-validation-loss error on 2026-04-04.", "evidence": [], "id": "train_001939", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Maple-01 allocated 3 GPUs to Project Sonata on 2026-05-30.", "evidence": [{"doc_id": "doc_000391", "sent_id": 6}], "id": "train_001940", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron had a failed run with Aster-7B on Node Willow-05 because of a missing-index error on 2026-05-03.", "evidence": [], "id": "train_001941", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Owen Torres was assigned as the evaluation owner on 2026-04-13 moved the Milestone D2 deadline from 2026-05-18 to 2026-05-10 on 2026-04-20.", "evidence": [{"doc_id": "doc_000039", "sent_id": 2}, {"doc_id": "doc_000367", "sent_id": 2}], "id": "train_001942", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-04-06 moved the Milestone Z1 deadline from 2026-06-23 to 2026-06-29 on 2026-06-05.", "evidence": [{"doc_id": "doc_000145", "sent_id": 2}, {"doc_id": "doc_000102", "sent_id": 8}], "id": "train_001943", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Vela-3B achieved a higher latency efficiency score than Marble-3B.", "evidence": [{"doc_id": "doc_000466", "sent_id": 7}, {"doc_id": "doc_000455", "sent_id": 2}], "id": "train_001944", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Dr. Ravi Reed was assigned as the evaluation owner on 2026-06-09 selected Kestrel-8B for error analysis on 2026-05-06.", "evidence": [{"doc_id": "doc_000384", "sent_id": 9}, {"doc_id": "doc_000394", "sent_id": 5}], "id": "train_001945", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Owen Torres was assigned as the evaluation owner on 2026-06-16 recorded accuracy for Aster-7B on SignalSet using Node Willow-05 on 2026-05-05.", "evidence": [{"doc_id": "doc_000463", "sent_id": 9}, {"doc_id": "doc_000263", "sent_id": 6}], "id": "train_001946", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-06-08 moved the Milestone N1 deadline from 2026-06-09 to 2026-06-04 on 2026-05-18.", "evidence": [{"doc_id": "doc_000042", "sent_id": 2}, {"doc_id": "doc_000382", "sent_id": 1}], "id": "train_001947", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Kira Iyer was assigned as the evaluation owner for Project Anchor on 2026-06-30.", "evidence": [{"doc_id": "doc_000309", "sent_id": 8}], "id": "train_001948", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-05-24 had a run with Nimbus-7B that failed because of a checkpoint-mismatch error on 2026-06-05 while using Node Elm-08.", "evidence": [{"doc_id": "doc_000226", "sent_id": 4}, {"doc_id": "doc_000471", "sent_id": 8}], "id": "train_001949", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Atlas-8B for evidence retrieval on 2026-06-11.", "evidence": [{"doc_id": "doc_000259", "sent_id": 3}], "id": "train_001950", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Talia Marin was assigned as the lead for Project Aster on 2026-06-22.", "evidence": [{"doc_id": "doc_000483", "sent_id": 2}], "id": "train_001951", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Atlas-3B achieved 0.684 macro F1 on RiverBench-2 for Project Nereid on 2026-04-18.", "evidence": [{"doc_id": "doc_000019", "sent_id": 6}], "id": "train_001952", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid moved the Milestone P1 deadline to 2026-04-30 on 2026-04-08.", "evidence": [{"doc_id": "doc_000292", "sent_id": 5}], "id": "train_001953", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata selected Mira-8B for calibration on 2026-04-16.", "evidence": [{"doc_id": "doc_000064", "sent_id": 3}], "id": "train_001954", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Elian Shah was assigned as the data steward for Project Meridian on 2026-05-19.", "evidence": [{"doc_id": "doc_000284", "sent_id": 7}], "id": "train_001955", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-04-28 selected Cedar-8B for calibration on 2026-05-07.", "evidence": [{"doc_id": "doc_000402", "sent_id": 7}, {"doc_id": "doc_000448", "sent_id": 3}], "id": "train_001956", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Cedar-3B achieved a higher latency efficiency score than Quartz-8B.", "evidence": [{"doc_id": "doc_000327", "sent_id": 6}, {"doc_id": "doc_000103", "sent_id": 8}], "id": "train_001957", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron selected Marble-3B for error analysis on 2026-04-24.", "evidence": [{"doc_id": "doc_000092", "sent_id": 8}], "id": "train_001958", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Orchid-8B achieved 0.860 evidence F1 on LabQA-3 for Project Aster on 2026-04-28.", "evidence": [], "id": "train_001959", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Rohan Singh was assigned as the retrieval owner for Project Sonata on 2026-06-24.", "evidence": [], "id": "train_001960", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from hard-negative mining to LoRA adaptation on 2026-06-14.", "evidence": [{"doc_id": "doc_000405", "sent_id": 4}], "id": "train_001961", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron moved the Milestone X1 deadline from 2026-07-09 to 2026-07-15 on 2026-06-18.", "evidence": [], "id": "train_001962", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron changed its method from structured prompting to QLoRA adaptation on 2026-04-20.", "evidence": [], "id": "train_001963", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared evidence F1 runs, Helix-7B achieved a higher evidence F1 than Finch-7B.", "evidence": [{"doc_id": "doc_000067", "sent_id": 2}, {"doc_id": "doc_000130", "sent_id": 7}], "id": "train_001964", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Quiet Retriever Study 2 reported on 2026-06-27 that it used dense retrieval and did not use a reward model.", "evidence": [], "id": "train_001965", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-14 recorded macro F1 for Mira-7B on TraceEval-3 using Node Cedar-02 on 2026-06-08.", "evidence": [{"doc_id": "doc_000216", "sent_id": 4}, {"doc_id": "doc_000046", "sent_id": 2}], "id": "train_001966", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Vector Lantern Study 1 reported on 2026-05-29 that it used confidence calibration and used a reward model.", "evidence": [{"doc_id": "doc_000325", "sent_id": 7}], "id": "train_001967", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-06-22 changed its method from data mixing to contrastive tuning on 2026-04-25.", "evidence": [{"doc_id": "doc_000365", "sent_id": 2}, {"doc_id": "doc_000010", "sent_id": 5}], "id": "train_001968", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Nadia Chen was assigned as the evaluation owner for Project Meridian on 2026-05-06.", "evidence": [], "id": "train_001969", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Cedar-3B for error analysis on 2026-05-18.", "evidence": [], "id": "train_001970", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian's run with Kestrel-8B failed because of a missing-index error on 2026-04-16 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000176", "sent_id": 3}], "id": "train_001971", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Willow-05 allocated 4 GPUs to Project Saffron on 2026-05-17.", "evidence": [], "id": "train_001972", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Noah Chen was assigned as the retrieval owner on 2026-04-06 received 3 GPUs from Node Spruce-03 on 2026-05-22.", "evidence": [{"doc_id": "doc_000271", "sent_id": 2}, {"doc_id": "doc_000100", "sent_id": 8}], "id": "train_001973", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Quartz-3B for claim classification on 2026-04-20.", "evidence": [], "id": "train_001974", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Atlas-7B for reranking on 2026-05-10.", "evidence": [{"doc_id": "doc_000041", "sent_id": 4}], "id": "train_001975", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Arun Kwan was assigned as the data steward for Project Anchor on 2026-05-04.", "evidence": [{"doc_id": "doc_000117", "sent_id": 2}], "id": "train_001976", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster's run with Orchid-7B failed because of an unstable-validation-loss error on 2026-05-07 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000134", "sent_id": 3}], "id": "train_001977", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata moved the Milestone D2 deadline to 2026-07-24 on 2026-06-24.", "evidence": [{"doc_id": "doc_000288", "sent_id": 7}], "id": "train_001978", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Julian Gray was assigned as the lead on 2026-06-02 moved the Milestone B2 deadline to 2026-05-16 on 2026-04-27.", "evidence": [{"doc_id": "doc_000212", "sent_id": 3}, {"doc_id": "doc_000485", "sent_id": 1}], "id": "train_001979", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor selected Finch-8B for error analysis on 2026-05-21.", "evidence": [{"doc_id": "doc_000323", "sent_id": 3}], "id": "train_001980", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Helix-3B for claim classification on 2026-04-17.", "evidence": [{"doc_id": "doc_000414", "sent_id": 12}], "id": "train_001981", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster moved the Milestone Z1 deadline to 2026-07-01 on 2026-06-11.", "evidence": [], "id": "train_001982", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Aster-7B achieved 0.720 accuracy on SignalSet for Project Saffron on 2026-06-16.", "evidence": [{"doc_id": "doc_000007", "sent_id": 7}], "id": "train_001983", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Sonata changed its method from chain verification to structured prompting on 2026-04-02.", "evidence": [], "id": "train_001984", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron's run with Mira-7B failed because of a missing-index error on 2026-05-01 while using Node Willow-05.", "evidence": [{"doc_id": "doc_000024", "sent_id": 8}], "id": "train_001985", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared latency efficiency score runs, Nova-7B achieved a higher latency efficiency score than Marble-3B.", "evidence": [{"doc_id": "doc_000268", "sent_id": 6}, {"doc_id": "doc_000190", "sent_id": 6}], "id": "train_001986", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor changed its method from BM25 retrieval to evidence pooling on 2026-04-27.", "evidence": [], "id": "train_001987", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Willow-05 allocated 3 GPUs to Project Sonata on 2026-05-09.", "evidence": [{"doc_id": "doc_000215", "sent_id": 6}], "id": "train_001988", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Juniper-06 allocated 3 GPUs to Project Aster on 2026-04-17.", "evidence": [], "id": "train_001989", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-01 changed its method from cross-encoder reranking to sentence pruning on 2026-06-10.", "evidence": [{"doc_id": "doc_000093", "sent_id": 2}, {"doc_id": "doc_000496", "sent_id": 5}], "id": "train_001990", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Talia Reed was assigned as the retrieval owner on 2026-05-26 selected Helix-3B for claim classification on 2026-06-16.", "evidence": [{"doc_id": "doc_000319", "sent_id": 7}, {"doc_id": "doc_000301", "sent_id": 6}], "id": "train_001991", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Owen Torres was assigned as the evaluation owner on 2026-04-13 moved the Milestone F1 deadline from 2026-05-06 to 2026-05-02 on 2026-04-24.", "evidence": [{"doc_id": "doc_000039", "sent_id": 2}, {"doc_id": "doc_000017", "sent_id": 8}], "id": "train_001992", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Atlas-7B for claim classification on 2026-04-26.", "evidence": [{"doc_id": "doc_000499", "sent_id": 4}], "id": "train_001993", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "River-3B achieved 0.813 macro F1 on RiverBench for Project Aster on 2026-05-26.", "evidence": [{"doc_id": "doc_000321", "sent_id": 7}], "id": "train_001994", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Kestrel-7B achieved 0.803 macro F1 on OrionBench-3 for Project Nereid on 2026-06-15.", "evidence": [{"doc_id": "doc_000089", "sent_id": 1}], "id": "train_001995", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Darian Hale was assigned as the retrieval owner for Project Nereid on 2026-04-14.", "evidence": [{"doc_id": "doc_000070", "sent_id": 7}], "id": "train_001996", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Dr. Julian Gray was assigned as the lead on 2026-04-14 changed its method from LoRA adaptation to hard-negative mining on 2026-06-14.", "evidence": [{"doc_id": "doc_000265", "sent_id": 8}, {"doc_id": "doc_000440", "sent_id": 4}], "id": "train_001997", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Elian Shah was assigned as the data steward for Project Meridian on 2026-05-11.", "evidence": [{"doc_id": "doc_000214", "sent_id": 2}], "id": "train_001998", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid's run with Kestrel-3B failed on Node Birch-04 because of an out-of-memory error on 2026-06-04.", "evidence": [{"doc_id": "doc_000356", "sent_id": 3}], "id": "train_001999", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid moved the Milestone V1 deadline to 2026-07-06 on 2026-06-11.", "evidence": [], "id": "train_002000", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Marble-7B achieved 0.601 latency efficiency score on VestaLogs-3 for Project Saffron on 2026-05-02.", "evidence": [{"doc_id": "doc_000390", "sent_id": 6}], "id": "train_002001", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Mira Sato was assigned as the data steward on 2026-05-12 changed its method from threshold search to contrastive tuning on 2026-04-19.", "evidence": [{"doc_id": "doc_000015", "sent_id": 7}, {"doc_id": "doc_000039", "sent_id": 4}], "id": "train_002002", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor selected Helix-8B for claim classification on 2026-04-21.", "evidence": [], "id": "train_002003", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared accuracy runs, Orchid-7B achieved a higher accuracy than Cedar-7B.", "evidence": [{"doc_id": "doc_000070", "sent_id": 6}, {"doc_id": "doc_000355", "sent_id": 8}], "id": "train_002004", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared accuracy runs, Orchid-3B achieved a higher accuracy than Kestrel-3B.", "evidence": [{"doc_id": "doc_000474", "sent_id": 7}, {"doc_id": "doc_000252", "sent_id": 2}], "id": "train_002005", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Mina Shah was assigned as the lead on 2026-04-21 changed its method from hybrid retrieval to QLoRA adaptation on 2026-04-01.", "evidence": [{"doc_id": "doc_000190", "sent_id": 7}, {"doc_id": "doc_000393", "sent_id": 5}], "id": "train_002006", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Marble-8B for error analysis on 2026-04-13.", "evidence": [], "id": "train_002007", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata changed its method from document chunking to metric smoothing on 2026-04-16.", "evidence": [], "id": "train_002008", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid moved the Milestone D1 deadline to 2026-07-09 on 2026-06-17.", "evidence": [{"doc_id": "doc_000427", "sent_id": 5}], "id": "train_002009", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Aster selected Atlas-7B for error analysis on 2026-04-02.", "evidence": [{"doc_id": "doc_000265", "sent_id": 4}], "id": "train_002010", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Helix-3B achieved a higher latency efficiency score than Marble-3B.", "evidence": [{"doc_id": "doc_000265", "sent_id": 7}, {"doc_id": "doc_000190", "sent_id": 6}], "id": "train_002011", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-05-25 moved the Milestone P1 deadline from 2026-04-17 to 2026-04-25 on 2026-04-09.", "evidence": [{"doc_id": "doc_000351", "sent_id": 1}, {"doc_id": "doc_000195", "sent_id": 5}], "id": "train_002012", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, Kestrel-3B achieved a higher accuracy than Cedar-3B.", "evidence": [{"doc_id": "doc_000252", "sent_id": 2}, {"doc_id": "doc_000033", "sent_id": 7}], "id": "train_002013", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Kestrel-8B achieved 0.777 accuracy on CedarQA for Project Meridian on 2026-06-06.", "evidence": [{"doc_id": "doc_000408", "sent_id": 8}], "id": "train_002014", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor's run with Lumen-7B failed because of a checkpoint-mismatch error on 2026-05-29 while using Node Sycamore-13.", "evidence": [{"doc_id": "doc_000344", "sent_id": 7}], "id": "train_002015", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor's run with Finch-3B failed on Node Juniper-06 because of a missing-index error on 2026-05-08.", "evidence": [{"doc_id": "doc_000397", "sent_id": 8}], "id": "train_002016", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian changed its method from cross-encoder reranking to sentence pruning on 2026-04-01.", "evidence": [{"doc_id": "doc_000318", "sent_id": 5}], "id": "train_002017", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Quartz-7B for claim classification on 2026-05-09.", "evidence": [], "id": "train_002018", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster changed its method from confidence calibration to calibrated voting on 2026-06-10.", "evidence": [{"doc_id": "doc_000356", "sent_id": 5}], "id": "train_002019", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Quartz-3B for claim classification on 2026-04-03.", "evidence": [{"doc_id": "doc_000288", "sent_id": 11}], "id": "train_002020", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Elm-08 allocated 6 GPUs to Project Meridian on 2026-06-20.", "evidence": [{"doc_id": "doc_000312", "sent_id": 6}], "id": "train_002021", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid's run with Atlas-3B failed because of an unstable-validation-loss error on 2026-05-16 while using Node Hazel-14.", "evidence": [{"doc_id": "doc_000100", "sent_id": 6}], "id": "train_002022", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Saffron's run with Aster-7B failed on Node Willow-05 because of an unstable-validation-loss error on 2026-06-19.", "evidence": [{"doc_id": "doc_000430", "sent_id": 8}], "id": "train_002023", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Cedar-8B achieved 0.797 evidence F1 on NereidNotes-3 for Project Nereid on 2026-04-20.", "evidence": [{"doc_id": "doc_000387", "sent_id": 2}], "id": "train_002024", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Quartz-8B achieved 0.753 evidence F1 on OrionBench for Project Sonata on 2026-04-21.", "evidence": [{"doc_id": "doc_000074", "sent_id": 6}], "id": "train_002025", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Quiet Retriever Study 3 reported on 2026-06-29 that it used sentence pruning and did not use a reward model.", "evidence": [{"doc_id": "doc_000205", "sent_id": 2}], "id": "train_002026", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Owen Marin was assigned as the data steward on 2026-06-15 received 6 GPUs from Node Willow-05 on 2026-05-30.", "evidence": [{"doc_id": "doc_000288", "sent_id": 2}, {"doc_id": "doc_000471", "sent_id": 6}], "id": "train_002027", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Nadia Singh was assigned as the data steward on 2026-06-29 selected Cedar-7B for error analysis on 2026-04-10.", "evidence": [{"doc_id": "doc_000292", "sent_id": 2}, {"doc_id": "doc_000103", "sent_id": 10}], "id": "train_002028", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-04-27 selected Kestrel-8B for error analysis on 2026-04-05.", "evidence": [{"doc_id": "doc_000165", "sent_id": 2}, {"doc_id": "doc_000484", "sent_id": 4}], "id": "train_002029", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Finch-7B achieved 0.806 macro F1 on NereidNotes-2 for Project Aster on 2026-06-09.", "evidence": [], "id": "train_002030", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian changed its method from metric smoothing to hard-negative mining on 2026-05-11.", "evidence": [], "id": "train_002031", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone V1 deadline from 2026-05-09 to 2026-05-15 on 2026-04-12.", "evidence": [], "id": "train_002032", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron's run with Quartz-3B failed on Node Pine-07 because of a missing-index error on 2026-05-28.", "evidence": [{"doc_id": "doc_000474", "sent_id": 3}], "id": "train_002033", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Nimbus-3B achieved 0.768 macro F1 on TraceEval-2 for Project Nereid on 2026-06-13.", "evidence": [{"doc_id": "doc_000463", "sent_id": 7}], "id": "train_002034", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor's run with Helix-7B failed on Node Poplar-12 because of a missing-index error on 2026-06-25.", "evidence": [{"doc_id": "doc_000281", "sent_id": 2}], "id": "train_002035", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Atlas-3B achieved 0.890 accuracy on SignalSet-3 for Project Meridian on 2026-05-26.", "evidence": [{"doc_id": "doc_000433", "sent_id": 7}], "id": "train_002036", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Dr. Jonas Rios was assigned as the lead on 2026-05-25 received 3 GPUs from Node Sycamore-13 on 2026-06-26.", "evidence": [{"doc_id": "doc_000384", "sent_id": 2}, {"doc_id": "doc_000197", "sent_id": 8}], "id": "train_002037", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Atlas-7B for error analysis on 2026-05-31.", "evidence": [{"doc_id": "doc_000128", "sent_id": 4}], "id": "train_002038", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Owen Torres was assigned as the evaluation owner on 2026-06-16 received 4 GPUs from Node Maple-01 on 2026-05-30.", "evidence": [{"doc_id": "doc_000463", "sent_id": 9}, {"doc_id": "doc_000266", "sent_id": 6}], "id": "train_002039", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from alias expansion to temporal filtering on 2026-05-20.", "evidence": [{"doc_id": "doc_000232", "sent_id": 5}], "id": "train_002040", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-04-21 recorded accuracy for Mira-3B on MemoTrace-2 using Node Sycamore-13 on 2026-06-02.", "evidence": [{"doc_id": "doc_000271", "sent_id": 7}, {"doc_id": "doc_000500", "sent_id": 6}], "id": "train_002041", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster's run with Helix-7B failed on Node Poplar-12 because of an out-of-memory error on 2026-06-28.", "evidence": [], "id": "train_002042", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata's run with Vela-3B failed because of an out-of-memory error on 2026-06-19 while using Node Pine-07.", "evidence": [{"doc_id": "doc_000218", "sent_id": 8}], "id": "train_002043", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Finch-3B achieved 0.810 latency efficiency score on VestaLogs for Project Anchor on 2026-05-11.", "evidence": [{"doc_id": "doc_000344", "sent_id": 1}], "id": "train_002044", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Camila Brooks was assigned as the data steward for Project Meridian on 2026-04-20.", "evidence": [{"doc_id": "doc_000390", "sent_id": 2}], "id": "train_002045", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Lattice Memory Study 2 reported on 2026-05-13 that it used QLoRA adaptation and used a reward model.", "evidence": [{"doc_id": "doc_000291", "sent_id": 5}], "id": "train_002046", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Frost was assigned as the data steward on 2026-06-15 received 2 GPUs from Node Poplar-12 on 2026-04-17.", "evidence": [{"doc_id": "doc_000193", "sent_id": 2}, {"doc_id": "doc_000347", "sent_id": 8}], "id": "train_002047", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Atlas-7B for calibration on 2026-05-10.", "evidence": [{"doc_id": "doc_000020", "sent_id": 4}], "id": "train_002048", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Amber Ranking Study 2 reported on 2026-06-11 that it used teacher distillation and did not use a reward model.", "evidence": [], "id": "train_002049", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from reward reranking to data mixing on 2026-05-31.", "evidence": [{"doc_id": "doc_000098", "sent_id": 4}], "id": "train_002050", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor moved the Milestone N1 deadline from 2026-04-27 to 2026-05-01 on 2026-04-15.", "evidence": [{"doc_id": "doc_000271", "sent_id": 5}], "id": "train_002051", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Nimbus-7B for calibration on 2026-05-11.", "evidence": [], "id": "train_002052", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from document chunking to metric smoothing on 2026-06-01.", "evidence": [], "id": "train_002053", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-06-16 selected Lumen-3B for evidence retrieval on 2026-04-30.", "evidence": [{"doc_id": "doc_000461", "sent_id": 7}, {"doc_id": "doc_000015", "sent_id": 3}], "id": "train_002054", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Anika Costa was assigned as the retrieval owner for Project Aster on 2026-06-02.", "evidence": [{"doc_id": "doc_000177", "sent_id": 7}], "id": "train_002055", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Atlas-7B for error analysis on 2026-06-28.", "evidence": [{"doc_id": "doc_000422", "sent_id": 4}], "id": "train_002056", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Atlas-3B for error analysis on 2026-04-26.", "evidence": [{"doc_id": "doc_000416", "sent_id": 4}], "id": "train_002057", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Sonata changed its method from metric smoothing to document chunking on 2026-04-15.", "evidence": [{"doc_id": "doc_000455", "sent_id": 5}], "id": "train_002058", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian's run with Kestrel-8B failed because of a checkpoint-mismatch error on 2026-05-07 while using Node Juniper-06.", "evidence": [{"doc_id": "doc_000127", "sent_id": 3}], "id": "train_002059", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian changed its method from late interaction to BM25 retrieval on 2026-04-24.", "evidence": [], "id": "train_002060", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster selected Finch-3B for error analysis on 2026-04-09.", "evidence": [{"doc_id": "doc_000092", "sent_id": 3}], "id": "train_002061", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Stable Chains Study 2 reported on 2026-06-12 that it used document chunking and did not use a reward model.", "evidence": [{"doc_id": "doc_000409", "sent_id": 8}], "id": "train_002062", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Atlas-3B achieved 0.699 latency efficiency score on SignalSet-3 for Project Meridian on 2026-05-12.", "evidence": [], "id": "train_002063", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Dr. Noah Chen was assigned as the retrieval owner for Project Aster on 2026-04-06.", "evidence": [{"doc_id": "doc_000271", "sent_id": 2}], "id": "train_002064", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from QLoRA adaptation to hybrid retrieval on 2026-05-25.", "evidence": [], "id": "train_002065", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Atlas-3B achieved 0.774 evidence F1 on RiverBench-2 for Project Nereid on 2026-05-09.", "evidence": [{"doc_id": "doc_000412", "sent_id": 6}], "id": "train_002066", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Rohan Park was assigned as the lead on 2026-04-14 received 5 GPUs from Node Willow-05 on 2026-05-02.", "evidence": [{"doc_id": "doc_000327", "sent_id": 7}, {"doc_id": "doc_000387", "sent_id": 6}], "id": "train_002067", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from threshold search to reward reranking on 2026-04-19.", "evidence": [{"doc_id": "doc_000289", "sent_id": 4}], "id": "train_002068", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from structured prompting to QLoRA adaptation on 2026-04-05.", "evidence": [{"doc_id": "doc_000292", "sent_id": 4}], "id": "train_002069", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-05-19 recorded evidence F1 for Quartz-7B on OrionBench using Node Pine-07 on 2026-04-07.", "evidence": [{"doc_id": "doc_000316", "sent_id": 8}, {"doc_id": "doc_000295", "sent_id": 7}], "id": "train_002070", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Mina Adler was assigned as the retrieval owner for Project Aster on 2026-05-04.", "evidence": [{"doc_id": "doc_000426", "sent_id": 2}], "id": "train_002071", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron moved the Milestone R1 deadline from 2026-05-16 to 2026-05-24 on 2026-04-22.", "evidence": [{"doc_id": "doc_000234", "sent_id": 5}], "id": "train_002072", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron selected Atlas-7B for calibration on 2026-05-03.", "evidence": [{"doc_id": "doc_000230", "sent_id": 4}], "id": "train_002073", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Finch-7B achieved a higher accuracy than Atlas-7B.", "evidence": [{"doc_id": "doc_000410", "sent_id": 6}, {"doc_id": "doc_000444", "sent_id": 4}], "id": "train_002074", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Leo Park was assigned as the evaluation owner for Project Anchor on 2026-04-07.", "evidence": [{"doc_id": "doc_000182", "sent_id": 7}], "id": "train_002075", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone N1 deadline from 2026-07-21 to 2026-07-27 on 2026-06-28.", "evidence": [], "id": "train_002076", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata moved the Milestone X1 deadline to 2026-05-17 on 2026-04-23.", "evidence": [], "id": "train_002077", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Mira-8B achieved a higher macro F1 than River-7B.", "evidence": [{"doc_id": "doc_000182", "sent_id": 6}, {"doc_id": "doc_000306", "sent_id": 7}], "id": "train_002078", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-06-29 selected Marble-3B for error analysis on 2026-05-10.", "evidence": [{"doc_id": "doc_000265", "sent_id": 2}, {"doc_id": "doc_000202", "sent_id": 3}], "id": "train_002079", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian changed its method from chain verification to structured prompting on 2026-05-18.", "evidence": [], "id": "train_002080", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared macro F1 runs, Lumen-8B achieved a higher macro F1 than Marble-8B.", "evidence": [{"doc_id": "doc_000373", "sent_id": 2}, {"doc_id": "doc_000137", "sent_id": 7}], "id": "train_002081", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron's run with Marble-3B failed on Node Maple-01 because of an unstable-validation-loss error on 2026-04-09.", "evidence": [{"doc_id": "doc_000026", "sent_id": 3}], "id": "train_002082", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Mina Shah was assigned as the lead on 2026-06-09 moved the Milestone B1 deadline to 2026-07-18 on 2026-06-18.", "evidence": [{"doc_id": "doc_000351", "sent_id": 6}, {"doc_id": "doc_000293", "sent_id": 3}], "id": "train_002083", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid had a failed run with Nimbus-7B on Node Birch-04 because of an unstable-validation-loss error on 2026-06-12.", "evidence": [{"doc_id": "doc_000474", "sent_id": 8}], "id": "train_002084", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone L1 deadline to 2026-04-14 on 2026-04-09.", "evidence": [], "id": "train_002085", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Nova-7B achieved 0.585 latency efficiency score on LumenFacts-2 for Project Meridian on 2026-05-27.", "evidence": [], "id": "train_002086", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Pale Compass Study 4 reported on 2026-06-06 that it used reward reranking and did not use a reward model.", "evidence": [], "id": "train_002087", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Nova-8B achieved 0.780 macro F1 on LabQA for Project Nereid on 2026-04-04.", "evidence": [{"doc_id": "doc_000483", "sent_id": 7}], "id": "train_002088", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata selected Mira-8B for calibration on 2026-04-27.", "evidence": [], "id": "train_002089", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from chain verification to structured prompting on 2026-04-03.", "evidence": [], "id": "train_002090", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster moved the Milestone Z1 deadline to 2026-06-20 on 2026-06-04.", "evidence": [], "id": "train_002091", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone H1 deadline from 2026-06-03 to 2026-06-09 on 2026-05-20.", "evidence": [{"doc_id": "doc_000326", "sent_id": 6}], "id": "train_002092", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Felix Lind was assigned as the lead on 2026-04-14 selected Quartz-7B for error analysis on 2026-06-14.", "evidence": [{"doc_id": "doc_000292", "sent_id": 7}, {"doc_id": "doc_000032", "sent_id": 4}], "id": "train_002093", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from evidence pooling to BM25 retrieval on 2026-05-20.", "evidence": [{"doc_id": "doc_000492", "sent_id": 5}], "id": "train_002094", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from calibrated voting to confidence calibration on 2026-06-29.", "evidence": [], "id": "train_002095", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid changed its method from chain verification to confidence calibration on 2026-04-29.", "evidence": [{"doc_id": "doc_000416", "sent_id": 5}], "id": "train_002096", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid changed its method from contrastive tuning to threshold search on 2026-04-03.", "evidence": [], "id": "train_002097", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster's run with Orchid-7B failed on Node Cedar-02 because of a missing-index error on 2026-05-01.", "evidence": [{"doc_id": "doc_000183", "sent_id": 7}], "id": "train_002098", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid changed its method from structured prompting to chain verification on 2026-06-24.", "evidence": [{"doc_id": "doc_000049", "sent_id": 5}], "id": "train_002099", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron moved the Milestone L1 deadline from 2026-06-13 to 2026-06-23 on 2026-05-17.", "evidence": [], "id": "train_002100", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian selected Atlas-7B for reranking on 2026-04-05.", "evidence": [{"doc_id": "doc_000097", "sent_id": 4}], "id": "train_002101", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-04-07 selected Cedar-7B for error analysis on 2026-05-21.", "evidence": [{"doc_id": "doc_000119", "sent_id": 9}, {"doc_id": "doc_000249", "sent_id": 4}], "id": "train_002102", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Lumen-7B for calibration on 2026-05-15.", "evidence": [{"doc_id": "doc_000412", "sent_id": 8}], "id": "train_002103", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Owen Torres was assigned as the evaluation owner on 2026-06-16 received 4 GPUs from Node Fir-10 on 2026-05-09.", "evidence": [{"doc_id": "doc_000463", "sent_id": 9}, {"doc_id": "doc_000252", "sent_id": 6}], "id": "train_002104", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian moved the Milestone D1 deadline from 2026-05-17 to 2026-05-21 on 2026-05-07.", "evidence": [], "id": "train_002105", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Quartz-7B achieved 0.618 accuracy on CedarQA-2 for Project Saffron on 2026-05-18.", "evidence": [{"doc_id": "doc_000363", "sent_id": 2}], "id": "train_002106", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Poplar-12 allocated 4 GPUs to Project Anchor on 2026-05-19.", "evidence": [], "id": "train_002107", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor changed its method from chain verification to confidence calibration on 2026-05-20.", "evidence": [{"doc_id": "doc_000337", "sent_id": 5}], "id": "train_002108", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Nadia Chen was assigned as the evaluation owner on 2026-06-16 changed its method from cross-encoder reranking to sentence pruning on 2026-04-01.", "evidence": [{"doc_id": "doc_000093", "sent_id": 7}, {"doc_id": "doc_000318", "sent_id": 5}], "id": "train_002109", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-05-18 selected Helix-7B for claim classification on 2026-05-05.", "evidence": [{"doc_id": "doc_000417", "sent_id": 2}, {"doc_id": "doc_000095", "sent_id": 7}], "id": "train_002110", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Dr. Julian Stone was assigned as the retrieval owner for Project Nereid on 2026-04-21.", "evidence": [{"doc_id": "doc_000092", "sent_id": 7}], "id": "train_002111", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster changed its method from cross-encoder reranking to late interaction on 2026-04-19.", "evidence": [{"doc_id": "doc_000073", "sent_id": 4}], "id": "train_002112", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster's run with Finch-3B failed on Node Juniper-06 because of an out-of-memory error on 2026-06-13.", "evidence": [], "id": "train_002113", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Marble-8B for error analysis on 2026-06-05.", "evidence": [{"doc_id": "doc_000323", "sent_id": 8}], "id": "train_002114", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Nadia Singh was assigned as the data steward for Project Meridian on 2026-05-13.", "evidence": [], "id": "train_002115", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared accuracy runs, Finch-8B achieved a higher accuracy than Helix-8B.", "evidence": [{"doc_id": "doc_000215", "sent_id": 7}, {"doc_id": "doc_000074", "sent_id": 1}], "id": "train_002116", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Leo Park was assigned as the evaluation owner on 2026-06-23 selected Finch-8B for error analysis on 2026-06-09.", "evidence": [{"doc_id": "doc_000062", "sent_id": 7}, {"doc_id": "doc_000106", "sent_id": 7}], "id": "train_002117", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared macro F1 runs, Cedar-3B achieved a higher macro F1 than Lumen-3B.", "evidence": [{"doc_id": "doc_000443", "sent_id": 4}, {"doc_id": "doc_000279", "sent_id": 6}], "id": "train_002118", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Dr. Jonas Rios was assigned as the lead for Project Sonata on 2026-05-20.", "evidence": [], "id": "train_002119", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Atlas-7B for claim classification on 2026-05-03.", "evidence": [{"doc_id": "doc_000303", "sent_id": 4}], "id": "train_002120", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Lena Costa was assigned as the data steward on 2026-05-05 changed its method from dense retrieval to alias expansion on 2026-05-27.", "evidence": [{"doc_id": "doc_000133", "sent_id": 7}, {"doc_id": "doc_000033", "sent_id": 5}], "id": "train_002121", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Marble-7B achieved 0.642 accuracy on VestaLogs-3 for Project Saffron on 2026-06-29.", "evidence": [{"doc_id": "doc_000097", "sent_id": 2}], "id": "train_002122", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Amber Ranking Study 1 reported on 2026-04-07 that it used confidence calibration and did not use a reward model.", "evidence": [], "id": "train_002123", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Rowan-09 allocated 2 GPUs to Project Anchor on 2026-04-26.", "evidence": [], "id": "train_002124", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-06-29 changed its method from reward reranking to threshold search on 2026-05-10.", "evidence": [{"doc_id": "doc_000265", "sent_id": 2}, {"doc_id": "doc_000407", "sent_id": 4}], "id": "train_002125", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Priya Moss was assigned as the data steward on 2026-06-22 received 4 GPUs from Node Willow-05 on 2026-04-10.", "evidence": [{"doc_id": "doc_000119", "sent_id": 2}, {"doc_id": "doc_000274", "sent_id": 7}], "id": "train_002126", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Maple-01 allocated 1 GPU to Project Sonata on 2026-04-07.", "evidence": [], "id": "train_002127", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian's run with Nova-7B failed on Node Aspen-01 because of an unstable-validation-loss error on 2026-04-16.", "evidence": [{"doc_id": "doc_000183", "sent_id": 2}], "id": "train_002128", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared accuracy runs, Atlas-7B achieved a higher accuracy than Nimbus-3B.", "evidence": [{"doc_id": "doc_000208", "sent_id": 2}, {"doc_id": "doc_000353", "sent_id": 7}], "id": "train_002129", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Vera Kim was assigned as the lead for Project Nereid on 2026-06-15.", "evidence": [{"doc_id": "doc_000309", "sent_id": 2}], "id": "train_002130", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-05-26 changed its method from teacher distillation to rank fusion on 2026-05-21.", "evidence": [{"doc_id": "doc_000090", "sent_id": 7}, {"doc_id": "doc_000021", "sent_id": 3}], "id": "train_002131", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Felix Lind was assigned as the lead for Project Aster on 2026-04-06.", "evidence": [{"doc_id": "doc_000019", "sent_id": 2}], "id": "train_002132", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-05-11 changed its method from cross-encoder reranking to sentence pruning on 2026-04-01.", "evidence": [{"doc_id": "doc_000319", "sent_id": 2}, {"doc_id": "doc_000318", "sent_id": 5}], "id": "train_002133", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-05-05 changed its method from cross-encoder reranking to late interaction on 2026-05-17.", "evidence": [{"doc_id": "doc_000368", "sent_id": 7}, {"doc_id": "doc_000319", "sent_id": 4}], "id": "train_002134", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor selected River-7B for evidence retrieval on 2026-05-14.", "evidence": [{"doc_id": "doc_000071", "sent_id": 3}], "id": "train_002135", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The review of Quiet Retriever Study 4 reported on 2026-04-28 that it used structured prompting and did not use a reward model.", "evidence": [], "id": "train_002136", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared latency efficiency score runs, Helix-8B achieved a higher latency efficiency score than Finch-8B.", "evidence": [{"doc_id": "doc_000240", "sent_id": 2}, {"doc_id": "doc_000302", "sent_id": 7}], "id": "train_002137", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Vela-7B achieved 0.670 evidence F1 on LabQA-2 for Project Sonata on 2026-06-29.", "evidence": [], "id": "train_002138", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster changed its method from evidence pooling to calibrated voting on 2026-05-10.", "evidence": [{"doc_id": "doc_000448", "sent_id": 4}], "id": "train_002139", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nimbus-3B achieved 0.768 macro F1 on TraceEval-2 for Project Nereid on 2026-06-14.", "evidence": [], "id": "train_002140", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Elian Ames was assigned as the evaluation owner on 2026-05-04 selected Nova-3B for reranking on 2026-05-08.", "evidence": [{"doc_id": "doc_000018", "sent_id": 2}, {"doc_id": "doc_000390", "sent_id": 8}], "id": "train_002141", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Priya Moss was assigned as the data steward on 2026-06-22 selected Nimbus-8B for calibration on 2026-05-07.", "evidence": [{"doc_id": "doc_000119", "sent_id": 2}, {"doc_id": "doc_000448", "sent_id": 3}], "id": "train_002142", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor's run with Helix-8B failed because of a missing-index error on 2026-04-09 while using Node Pine-07.", "evidence": [{"doc_id": "doc_000278", "sent_id": 3}], "id": "train_002143", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Vela-7B achieved a higher macro F1 than Helix-7B.", "evidence": [{"doc_id": "doc_000079", "sent_id": 7}, {"doc_id": "doc_000147", "sent_id": 2}], "id": "train_002144", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian moved the Milestone B2 deadline from 2026-06-24 to 2026-06-30 on 2026-06-10.", "evidence": [{"doc_id": "doc_000461", "sent_id": 5}], "id": "train_002145", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone L1 deadline to 2026-05-15 on 2026-04-15.", "evidence": [{"doc_id": "doc_000490", "sent_id": 5}], "id": "train_002146", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "River-7B achieved 0.899 latency efficiency score on SignalSet-2 for Project Anchor on 2026-04-15.", "evidence": [], "id": "train_002147", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-05-05 selected Cedar-7B for reranking on 2026-06-28.", "evidence": [{"doc_id": "doc_000346", "sent_id": 7}, {"doc_id": "doc_000251", "sent_id": 4}], "id": "train_002148", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Atlas-7B for error analysis on 2026-05-17.", "evidence": [{"doc_id": "doc_000337", "sent_id": 4}], "id": "train_002149", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Nadia Singh was assigned as the data steward for Project Meridian on 2026-06-29.", "evidence": [{"doc_id": "doc_000292", "sent_id": 2}], "id": "train_002150", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Orchid-7B achieved 0.819 macro F1 on LabQA-3 for Project Aster on 2026-05-04.", "evidence": [{"doc_id": "doc_000127", "sent_id": 2}], "id": "train_002151", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-05-05 selected Nova-8B for reranking on 2026-05-15.", "evidence": [{"doc_id": "doc_000346", "sent_id": 7}, {"doc_id": "doc_000151", "sent_id": 8}], "id": "train_002152", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata changed its method from alias expansion to temporal filtering on 2026-04-29.", "evidence": [{"doc_id": "doc_000499", "sent_id": 5}], "id": "train_002153", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Spruce-03 allocated 6 GPUs to Project Anchor on 2026-05-16.", "evidence": [{"doc_id": "doc_000450", "sent_id": 6}], "id": "train_002154", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-04-27 changed its method from metric smoothing to document chunking on 2026-04-15.", "evidence": [{"doc_id": "doc_000015", "sent_id": 2}, {"doc_id": "doc_000455", "sent_id": 5}], "id": "train_002155", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from alias expansion to dense retrieval on 2026-05-03.", "evidence": [{"doc_id": "doc_000151", "sent_id": 4}], "id": "train_002156", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Mira-8B achieved a higher evidence F1 than Nova-8B.", "evidence": [{"doc_id": "doc_000263", "sent_id": 3}, {"doc_id": "doc_000146", "sent_id": 7}], "id": "train_002157", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Felix Lind was assigned as the lead on 2026-04-14 received 3 GPUs from Node Cedar-02 on 2026-06-05.", "evidence": [{"doc_id": "doc_000292", "sent_id": 7}, {"doc_id": "doc_000500", "sent_id": 7}], "id": "train_002158", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Amber Ranking Study 3 reported on 2026-04-09 that it used hybrid retrieval and did not use a reward model.", "evidence": [], "id": "train_002159", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata's run with Marble-3B failed because of an out-of-memory error on 2026-05-14 while using Node Fir-10.", "evidence": [{"doc_id": "doc_000321", "sent_id": 3}], "id": "train_002160", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Orchid-8B achieved 0.845 latency efficiency score on LumenFacts for Project Anchor on 2026-04-04.", "evidence": [{"doc_id": "doc_000119", "sent_id": 8}], "id": "train_002161", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Marble-7B achieved a higher evidence F1 than Mira-8B.", "evidence": [{"doc_id": "doc_000429", "sent_id": 6}, {"doc_id": "doc_000263", "sent_id": 3}], "id": "train_002162", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Atlas-7B achieved 0.830 latency efficiency score on SignalSet-3 for Project Meridian on 2026-05-25.", "evidence": [{"doc_id": "doc_000413", "sent_id": 1}], "id": "train_002163", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Talia Reed was assigned as the retrieval owner on 2026-04-14 moved the Milestone N1 deadline to 2026-06-04 on 2026-05-18.", "evidence": [{"doc_id": "doc_000047", "sent_id": 7}, {"doc_id": "doc_000382", "sent_id": 1}], "id": "train_002164", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-05-11 received 6 GPUs from Node Maple-01 on 2026-04-04.", "evidence": [{"doc_id": "doc_000241", "sent_id": 2}, {"doc_id": "doc_000228", "sent_id": 6}], "id": "train_002165", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Laurel-11 allocated 3 GPUs to Project Nereid on 2026-04-11.", "evidence": [{"doc_id": "doc_000373", "sent_id": 6}], "id": "train_002166", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "River-8B achieved 0.910 accuracy on SignalSet-2 for Project Anchor on 2026-04-13.", "evidence": [{"doc_id": "doc_000254", "sent_id": 2}], "id": "train_002167", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Atlas-7B for evidence retrieval on 2026-05-10.", "evidence": [{"doc_id": "doc_000127", "sent_id": 4}], "id": "train_002168", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Node Poplar-12 allocated 6 GPUs to Project Anchor on 2026-06-27.", "evidence": [{"doc_id": "doc_000125", "sent_id": 6}], "id": "train_002169", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Rohan Singh was assigned as the retrieval owner for Project Sonata on 2026-04-07.", "evidence": [{"doc_id": "doc_000355", "sent_id": 9}], "id": "train_002170", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Nova-3B for reranking on 2026-06-15.", "evidence": [], "id": "train_002171", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron had a failed run with Kestrel-8B on Node Willow-05 because of a missing-index error on 2026-06-11.", "evidence": [{"doc_id": "doc_000032", "sent_id": 3}], "id": "train_002172", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Marble-8B for error analysis on 2026-05-02.", "evidence": [], "id": "train_002173", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster changed its method from query rewriting to sentence pruning on 2026-05-21.", "evidence": [], "id": "train_002174", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mara Lane was assigned as the retrieval owner for Project Sonata on 2026-04-28.", "evidence": [{"doc_id": "doc_000289", "sent_id": 7}], "id": "train_002175", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Leo Hale was assigned as the data steward for Project Anchor on 2026-06-30.", "evidence": [{"doc_id": "doc_000419", "sent_id": 8}], "id": "train_002176", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron changed its method from evidence pooling to calibrated voting on 2026-06-01.", "evidence": [], "id": "train_002177", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Helix-8B achieved 0.598 accuracy on CedarQA-3 for Project Anchor on 2026-06-30.", "evidence": [{"doc_id": "doc_000089", "sent_id": 6}], "id": "train_002178", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-05-04 moved the Milestone N1 deadline from 2026-07-11 to 2026-07-06 on 2026-06-19.", "evidence": [{"doc_id": "doc_000023", "sent_id": 2}, {"doc_id": "doc_000477", "sent_id": 8}], "id": "train_002179", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Talia Marin was assigned as the lead for Project Aster on 2026-06-08.", "evidence": [{"doc_id": "doc_000259", "sent_id": 2}], "id": "train_002180", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster's run with Lumen-3B failed because of an unstable-validation-loss error on 2026-04-16 while using Node Sycamore-13.", "evidence": [{"doc_id": "doc_000024", "sent_id": 3}], "id": "train_002181", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Nested Verifier Study 1 reported on 2026-05-22 that it used cross-encoder reranking and did not use a reward model.", "evidence": [{"doc_id": "doc_000478", "sent_id": 8}], "id": "train_002182", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron selected Atlas-7B for claim classification on 2026-05-31.", "evidence": [{"doc_id": "doc_000486", "sent_id": 4}], "id": "train_002183", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata had a failed run with Quartz-7B on Node Pine-07 because of an out-of-memory error on 2026-04-16.", "evidence": [{"doc_id": "doc_000398", "sent_id": 3}], "id": "train_002184", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-23 received 3 GPUs from Node Maple-01 on 2026-05-23.", "evidence": [{"doc_id": "doc_000225", "sent_id": 7}, {"doc_id": "doc_000232", "sent_id": 6}], "id": "train_002185", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Atlas-8B for evidence retrieval on 2026-06-05.", "evidence": [{"doc_id": "doc_000043", "sent_id": 8}], "id": "train_002186", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Nimbus-7B achieved 0.693 accuracy on MemoTrace for Project Meridian on 2026-04-11.", "evidence": [{"doc_id": "doc_000414", "sent_id": 8}], "id": "train_002187", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-05-03 had a run with Orchid-7B on Node Cedar-02 that failed because of an out-of-memory error on 2026-04-10.", "evidence": [{"doc_id": "doc_000485", "sent_id": 3}, {"doc_id": "doc_000311", "sent_id": 8}], "id": "train_002188", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from cross-encoder reranking to sentence pruning on 2026-05-13.", "evidence": [{"doc_id": "doc_000020", "sent_id": 5}], "id": "train_002189", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected River-8B for evidence retrieval on 2026-06-08.", "evidence": [], "id": "train_002190", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata's run with Mira-3B failed on Node Sycamore-13 because of a missing-index error on 2026-06-12.", "evidence": [{"doc_id": "doc_000140", "sent_id": 8}], "id": "train_002191", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid moved the Milestone B2 deadline to 2026-06-20 on 2026-06-03.", "evidence": [], "id": "train_002192", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Nested Verifier Study 1 reported on 2026-05-18 that it used calibrated voting and used a reward model.", "evidence": [{"doc_id": "doc_000317", "sent_id": 2}], "id": "train_002193", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-05-10 recorded evidence F1 for Marble-7B on NereidNotes using Node Juniper-06 on 2026-05-25.", "evidence": [{"doc_id": "doc_000280", "sent_id": 4}, {"doc_id": "doc_000140", "sent_id": 2}], "id": "train_002194", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Willow-05 allocated 2 GPUs to Project Saffron on 2026-04-04.", "evidence": [], "id": "train_002195", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected Finch-3B for error analysis on 2026-05-02.", "evidence": [], "id": "train_002196", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Atlas-8B achieved 0.845 evidence F1 on RiverBench-2 for Project Nereid on 2026-04-13.", "evidence": [{"doc_id": "doc_000269", "sent_id": 2}], "id": "train_002197", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Vela-8B achieved 0.756 accuracy on LumenFacts-3 for Project Saffron on 2026-05-19.", "evidence": [{"doc_id": "doc_000127", "sent_id": 7}], "id": "train_002198", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata had a failed run with Marble-3B on Node Maple-01 because of an unstable-validation-loss error on 2026-05-30.", "evidence": [], "id": "train_002199", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Laurel-11 allocated 4 GPUs to Project Meridian on 2026-06-14.", "evidence": [], "id": "train_002200", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian moved the Milestone P1 deadline from 2026-06-27 to 2026-07-05 on 2026-06-20.", "evidence": [], "id": "train_002201", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Theo Lind was assigned as the evaluation owner on 2026-05-19 changed its method from data mixing to contrastive tuning on 2026-04-04.", "evidence": [{"doc_id": "doc_000023", "sent_id": 7}, {"doc_id": "doc_000124", "sent_id": 5}], "id": "train_002202", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster's run with River-3B failed because of an unstable-validation-loss error on 2026-06-04 while using Node Maple-01.", "evidence": [{"doc_id": "doc_000430", "sent_id": 3}], "id": "train_002203", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "River-8B achieved 0.824 evidence F1 on RiverBench for Project Aster on 2026-06-20.", "evidence": [{"doc_id": "doc_000031", "sent_id": 6}], "id": "train_002204", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Lumen-7B achieved a higher accuracy than Vela-7B.", "evidence": [{"doc_id": "doc_000243", "sent_id": 4}, {"doc_id": "doc_000144", "sent_id": 6}], "id": "train_002205", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron changed its method from teacher distillation to query rewriting on 2026-06-15.", "evidence": [], "id": "train_002206", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-06-29 changed its method from dense retrieval to alias expansion on 2026-05-27.", "evidence": [{"doc_id": "doc_000265", "sent_id": 2}, {"doc_id": "doc_000033", "sent_id": 5}], "id": "train_002207", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Orchid-8B for reranking on 2026-05-11.", "evidence": [], "id": "train_002208", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-04-20 changed its method from query rewriting to sentence pruning on 2026-06-14.", "evidence": [{"doc_id": "doc_000133", "sent_id": 2}, {"doc_id": "doc_000237", "sent_id": 4}], "id": "train_002209", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian moved the Milestone P1 deadline from 2026-06-07 to 2026-06-17 on 2026-05-20.", "evidence": [{"doc_id": "doc_000030", "sent_id": 5}], "id": "train_002210", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-05-31 had a run with River-3B on Node Rowan-09 that failed because of a checkpoint-mismatch error on 2026-04-03.", "evidence": [{"doc_id": "doc_000236", "sent_id": 4}, {"doc_id": "doc_000240", "sent_id": 8}], "id": "train_002211", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Anika Sato was assigned as the lead on 2026-06-01 recorded evidence F1 for River-7B on RiverBench using Node Rowan-09 on 2026-04-19.", "evidence": [{"doc_id": "doc_000463", "sent_id": 2}, {"doc_id": "doc_000314", "sent_id": 4}], "id": "train_002212", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Poplar-12 allocated 2 GPUs to Project Anchor on 2026-05-14.", "evidence": [], "id": "train_002213", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-05-05 recorded latency efficiency score for Lumen-3B on MemoTrace-3 using Node Spruce-03 on 2026-04-12.", "evidence": [{"doc_id": "doc_000004", "sent_id": 9}, {"doc_id": "doc_000322", "sent_id": 3}], "id": "train_002214", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron's run with Marble-8B failed on Node Maple-01 because of a checkpoint-mismatch error on 2026-04-16.", "evidence": [{"doc_id": "doc_000269", "sent_id": 3}], "id": "train_002215", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Spruce-03 allocated 5 GPUs to Project Aster on 2026-04-04.", "evidence": [{"doc_id": "doc_000281", "sent_id": 5}], "id": "train_002216", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-04-13 recorded latency efficiency score for Lumen-7B on MemoTrace-3 using Node Willow-05 on 2026-06-01.", "evidence": [{"doc_id": "doc_000159", "sent_id": 1}, {"doc_id": "doc_000034", "sent_id": 2}], "id": "train_002217", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, Quartz-8B achieved a higher accuracy than Orchid-8B.", "evidence": [{"doc_id": "doc_000343", "sent_id": 6}, {"doc_id": "doc_000411", "sent_id": 3}], "id": "train_002218", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared accuracy runs, Kestrel-3B achieved a higher accuracy than Nimbus-3B.", "evidence": [{"doc_id": "doc_000229", "sent_id": 7}, {"doc_id": "doc_000196", "sent_id": 2}], "id": "train_002219", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor selected Finch-3B for error analysis on 2026-05-29.", "evidence": [{"doc_id": "doc_000138", "sent_id": 8}], "id": "train_002220", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Dr. Arun Bauer was assigned as the evaluation owner for Project Anchor on 2026-06-10.", "evidence": [], "id": "train_002221", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid selected Nova-8B for reranking on 2026-06-19.", "evidence": [{"doc_id": "doc_000461", "sent_id": 8}], "id": "train_002222", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid's run with Nova-7B failed because of an unstable-validation-loss error on 2026-06-05 while using Node Aspen-01.", "evidence": [{"doc_id": "doc_000391", "sent_id": 8}], "id": "train_002223", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The review of Lattice Memory Study 1 reported on 2026-04-03 that it used rank fusion and used a reward model.", "evidence": [{"doc_id": "doc_000029", "sent_id": 8}], "id": "train_002224", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata had a failed run with Aster-3B on Node Willow-05 because of an out-of-memory error on 2026-05-02.", "evidence": [], "id": "train_002225", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Evan Moss was assigned as the lead for Project Nereid on 2026-04-06.", "evidence": [{"doc_id": "doc_000061", "sent_id": 2}], "id": "train_002226", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Granite Context Study 1 reported on 2026-04-17 that it used dense retrieval and used a reward model.", "evidence": [{"doc_id": "doc_000235", "sent_id": 8}], "id": "train_002227", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Cedar-3B achieved 0.771 accuracy on VestaLogs-2 for Project Meridian on 2026-06-03.", "evidence": [], "id": "train_002228", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata selected Marble-3B for error analysis on 2026-06-19.", "evidence": [{"doc_id": "doc_000037", "sent_id": 8}], "id": "train_002229", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Laurel-11 allocated 2 GPUs to Project Meridian on 2026-04-16.", "evidence": [], "id": "train_002230", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Theo Grant was assigned as the data steward for Project Saffron on 2026-04-21.", "evidence": [{"doc_id": "doc_000271", "sent_id": 7}], "id": "train_002231", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Quiet Retriever Study 3 reported on 2026-06-29 that it used sentence pruning and used a reward model.", "evidence": [{"doc_id": "doc_000205", "sent_id": 2}], "id": "train_002232", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected River-8B for evidence retrieval on 2026-05-23.", "evidence": [], "id": "train_002233", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor selected Lumen-7B for calibration on 2026-06-25.", "evidence": [{"doc_id": "doc_000142", "sent_id": 3}], "id": "train_002234", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster selected Lumen-7B for calibration on 2026-05-16.", "evidence": [], "id": "train_002235", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-06-16 selected Helix-8B for claim classification on 2026-04-30.", "evidence": [{"doc_id": "doc_000461", "sent_id": 7}, {"doc_id": "doc_000462", "sent_id": 3}], "id": "train_002236", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-04-20 changed its method from BM25 retrieval to late interaction on 2026-05-16.", "evidence": [{"doc_id": "doc_000133", "sent_id": 2}, {"doc_id": "doc_000291", "sent_id": 6}], "id": "train_002237", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Pine-07 allocated 5 GPUs to Project Sonata on 2026-04-11.", "evidence": [{"doc_id": "doc_000242", "sent_id": 6}], "id": "train_002238", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Selene Kim was assigned as the evaluation owner for Project Anchor on 2026-05-26.", "evidence": [{"doc_id": "doc_000030", "sent_id": 7}], "id": "train_002239", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from document chunking to metric smoothing on 2026-05-10.", "evidence": [{"doc_id": "doc_000426", "sent_id": 4}], "id": "train_002240", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from calibrated voting to confidence calibration on 2026-06-28.", "evidence": [{"doc_id": "doc_000483", "sent_id": 4}], "id": "train_002241", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Aspen-01 allocated 3 GPUs to Project Nereid on 2026-04-09.", "evidence": [], "id": "train_002242", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Birch-04 allocated 2 GPUs to Project Meridian on 2026-06-28.", "evidence": [], "id": "train_002243", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-06-16 selected Aster-7B for evidence retrieval on 2026-04-08.", "evidence": [{"doc_id": "doc_000435", "sent_id": 7}, {"doc_id": "doc_000059", "sent_id": 4}], "id": "train_002244", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster moved the Milestone H1 deadline to 2026-04-17 on 2026-04-01.", "evidence": [{"doc_id": "doc_000355", "sent_id": 7}], "id": "train_002245", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Mira-8B achieved 0.643 latency efficiency score on MemoTrace-2 for Project Saffron on 2026-04-22.", "evidence": [], "id": "train_002246", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Cedar-8B for error analysis on 2026-06-01.", "evidence": [], "id": "train_002247", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected Helix-8B for claim classification on 2026-06-19.", "evidence": [{"doc_id": "doc_000463", "sent_id": 10}], "id": "train_002248", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid selected Nova-8B for reranking on 2026-06-25.", "evidence": [{"doc_id": "doc_000355", "sent_id": 4}], "id": "train_002249", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-04-20 moved the Milestone R1 deadline from 2026-06-02 to 2026-05-28 on 2026-05-19.", "evidence": [{"doc_id": "doc_000260", "sent_id": 2}, {"doc_id": "doc_000164", "sent_id": 7}], "id": "train_002250", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Orchid-3B achieved 0.515 latency efficiency score on LumenFacts for Project Anchor on 2026-04-18.", "evidence": [{"doc_id": "doc_000490", "sent_id": 6}], "id": "train_002251", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Mira-8B achieved a higher accuracy than Quartz-7B.", "evidence": [{"doc_id": "doc_000189", "sent_id": 2}, {"doc_id": "doc_000147", "sent_id": 7}], "id": "train_002252", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid moved the Milestone J1 deadline to 2026-07-18 on 2026-06-17.", "evidence": [], "id": "train_002253", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared macro F1 runs, Nova-7B achieved a higher macro F1 than River-7B.", "evidence": [{"doc_id": "doc_000259", "sent_id": 6}, {"doc_id": "doc_000306", "sent_id": 7}], "id": "train_002254", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid's run with Cedar-3B failed because of an unstable-validation-loss error on 2026-04-10 while using Node Laurel-11.", "evidence": [{"doc_id": "doc_000318", "sent_id": 8}], "id": "train_002255", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Quartz-3B achieved a higher macro F1 than Mira-7B.", "evidence": [{"doc_id": "doc_000484", "sent_id": 7}, {"doc_id": "doc_000046", "sent_id": 2}], "id": "train_002256", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Priya Vale was assigned as the evaluation owner for Project Saffron on 2026-04-07.", "evidence": [{"doc_id": "doc_000483", "sent_id": 8}], "id": "train_002257", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid selected Nova-3B for reranking on 2026-06-12.", "evidence": [{"doc_id": "doc_000267", "sent_id": 8}], "id": "train_002258", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian's run with Atlas-8B failed because of a missing-index error on 2026-05-21 while using Node Hazel-14.", "evidence": [{"doc_id": "doc_000067", "sent_id": 3}], "id": "train_002259", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Lumen-3B achieved 0.634 accuracy on MemoTrace-3 for Project Anchor on 2026-06-02.", "evidence": [{"doc_id": "doc_000391", "sent_id": 7}], "id": "train_002260", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-04-27 moved the Milestone P1 deadline to 2026-05-06 on 2026-04-15.", "evidence": [{"doc_id": "doc_000394", "sent_id": 2}, {"doc_id": "doc_000437", "sent_id": 5}], "id": "train_002261", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Vela-7B achieved a higher evidence F1 than Aster-7B.", "evidence": [{"doc_id": "doc_000278", "sent_id": 2}, {"doc_id": "doc_000230", "sent_id": 7}], "id": "train_002262", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor selected Lumen-3B for calibration on 2026-06-11.", "evidence": [{"doc_id": "doc_000031", "sent_id": 3}], "id": "train_002263", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Birch-04 allocated 4 GPUs to Project Meridian on 2026-04-25.", "evidence": [{"doc_id": "doc_000254", "sent_id": 6}], "id": "train_002264", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Finch-8B for error analysis on 2026-05-19.", "evidence": [], "id": "train_002265", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster changed its method from hybrid retrieval to document chunking on 2026-06-25.", "evidence": [], "id": "train_002266", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira-3B achieved 0.654 accuracy on MemoTrace-2 for Project Saffron on 2026-04-20.", "evidence": [{"doc_id": "doc_000499", "sent_id": 2}], "id": "train_002267", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Atlas-7B for claim classification on 2026-06-28.", "evidence": [{"doc_id": "doc_000228", "sent_id": 4}], "id": "train_002268", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Aster selected Helix-8B for claim classification on 2026-06-04.", "evidence": [{"doc_id": "doc_000481", "sent_id": 3}], "id": "train_002269", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Kira Frost was assigned as the data steward for Project Anchor on 2026-06-13.", "evidence": [], "id": "train_002270", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Rohan Singh was assigned as the retrieval owner for Project Sonata on 2026-06-23.", "evidence": [{"doc_id": "doc_000003", "sent_id": 7}], "id": "train_002271", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Aster-7B achieved a higher macro F1 than Vela-7B.", "evidence": [{"doc_id": "doc_000055", "sent_id": 7}, {"doc_id": "doc_000467", "sent_id": 2}], "id": "train_002272", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Silver Notes Study 4 reported on 2026-06-22 that it used data mixing and did not use a reward model.", "evidence": [{"doc_id": "doc_000445", "sent_id": 2}], "id": "train_002273", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared macro F1 runs, Nimbus-8B achieved a higher macro F1 than Atlas-3B.", "evidence": [{"doc_id": "doc_000173", "sent_id": 7}, {"doc_id": "doc_000026", "sent_id": 2}], "id": "train_002274", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Atlas-7B achieved 0.759 latency efficiency score on SignalSet-3 for Project Meridian on 2026-05-26.", "evidence": [], "id": "train_002275", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Dr. Noah Chen was assigned as the retrieval owner on 2026-04-06 recorded evidence F1 for Lumen-3B on TraceEval using Node Spruce-03 on 2026-05-17.", "evidence": [{"doc_id": "doc_000271", "sent_id": 2}, {"doc_id": "doc_000297", "sent_id": 4}], "id": "train_002276", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Iris Lane was assigned as the evaluation owner on 2026-05-19 had a run with Finch-7B on Node Juniper-06 that failed because of a missing-index error on 2026-06-13.", "evidence": [{"doc_id": "doc_000495", "sent_id": 6}, {"doc_id": "doc_000421", "sent_id": 6}], "id": "train_002277", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "River-7B achieved 0.899 accuracy on SignalSet-2 for Project Anchor on 2026-05-23.", "evidence": [{"doc_id": "doc_000241", "sent_id": 6}], "id": "train_002278", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nimbus-7B achieved 0.809 macro F1 on TraceEval-2 for Project Nereid on 2026-04-28.", "evidence": [{"doc_id": "doc_000479", "sent_id": 7}], "id": "train_002279", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Amber Ranking Study 1 reported on 2026-04-11 that it used LoRA adaptation and did not use a reward model.", "evidence": [], "id": "train_002280", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Lena Costa was assigned as the data steward on 2026-05-05 changed its method from threshold search to reward reranking on 2026-04-19.", "evidence": [{"doc_id": "doc_000133", "sent_id": 7}, {"doc_id": "doc_000289", "sent_id": 4}], "id": "train_002281", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Orchid-7B achieved a higher evidence F1 than Quartz-7B.", "evidence": [{"doc_id": "doc_000410", "sent_id": 3}, {"doc_id": "doc_000141", "sent_id": 6}], "id": "train_002282", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian's run with Nova-7B failed because of a missing-index error on 2026-06-25 while using Node Aspen-01.", "evidence": [{"doc_id": "doc_000311", "sent_id": 3}], "id": "train_002283", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Atlas-8B for evidence retrieval on 2026-06-07.", "evidence": [{"doc_id": "doc_000356", "sent_id": 4}], "id": "train_002284", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Pine-07 allocated 1 GPU to Project Sonata on 2026-05-02.", "evidence": [], "id": "train_002285", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Saffron selected Atlas-7B for calibration on 2026-04-26.", "evidence": [{"doc_id": "doc_000397", "sent_id": 4}], "id": "train_002286", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Orchid-8B achieved 0.616 latency efficiency score on LumenFacts for Project Anchor on 2026-06-16.", "evidence": [{"doc_id": "doc_000072", "sent_id": 7}], "id": "train_002287", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira-7B achieved 0.699 macro F1 on TraceEval-3 for Project Sonata on 2026-06-02.", "evidence": [{"doc_id": "doc_000054", "sent_id": 7}], "id": "train_002288", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor changed its method from calibrated voting to confidence calibration on 2026-06-07.", "evidence": [{"doc_id": "doc_000169", "sent_id": 4}], "id": "train_002289", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared evidence F1 runs, Orchid-8B achieved a higher evidence F1 than Helix-3B.", "evidence": [{"doc_id": "doc_000085", "sent_id": 6}, {"doc_id": "doc_000273", "sent_id": 4}], "id": "train_002290", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Kestrel-8B achieved 0.803 evidence F1 on OrionBench-3 for Project Nereid on 2026-06-02.", "evidence": [{"doc_id": "doc_000363", "sent_id": 7}], "id": "train_002291", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid selected Atlas-8B for evidence retrieval on 2026-04-18.", "evidence": [], "id": "train_002292", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster's run with Helix-3B failed on Node Poplar-12 because of a checkpoint-mismatch error on 2026-05-07.", "evidence": [{"doc_id": "doc_000450", "sent_id": 3}], "id": "train_002293", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone X1 deadline to 2026-04-21 on 2026-04-15.", "evidence": [{"doc_id": "doc_000092", "sent_id": 5}], "id": "train_002294", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared evidence F1 runs, Lumen-7B achieved a higher evidence F1 than Nova-7B.", "evidence": [{"doc_id": "doc_000196", "sent_id": 7}, {"doc_id": "doc_000456", "sent_id": 2}], "id": "train_002295", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Cedar-7B achieved 0.696 macro F1 on NereidNotes-3 for Project Nereid on 2026-05-09.", "evidence": [{"doc_id": "doc_000400", "sent_id": 6}], "id": "train_002296", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Nadia Chen was assigned as the evaluation owner on 2026-05-05 had a failed run with Nova-8B on Node Aspen-01 because of an out-of-memory error on 2026-04-09.", "evidence": [{"doc_id": "doc_000260", "sent_id": 7}, {"doc_id": "doc_000002", "sent_id": 3}], "id": "train_002297", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Priya Moss was assigned as the data steward on 2026-05-11 changed its method from teacher distillation to rank fusion on 2026-06-14.", "evidence": [{"doc_id": "doc_000090", "sent_id": 2}, {"doc_id": "doc_000199", "sent_id": 4}], "id": "train_002298", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Quartz-3B for claim classification on 2026-05-14.", "evidence": [{"doc_id": "doc_000214", "sent_id": 3}], "id": "train_002299", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Cedar-3B for error analysis on 2026-04-04.", "evidence": [], "id": "train_002300", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid had a failed run with Nova-7B on Node Laurel-11 because of an unstable-validation-loss error on 2026-04-30.", "evidence": [{"doc_id": "doc_000230", "sent_id": 3}], "id": "train_002301", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Elian Shah was assigned as the data steward for Project Meridian on 2026-06-10.", "evidence": [], "id": "train_002302", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The review of Pale Compass Study 3 reported on 2026-04-02 that it used temporal filtering and did not use a reward model.", "evidence": [], "id": "train_002303", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Felix Lind was assigned as the lead for Project Aster on 2026-04-28.", "evidence": [{"doc_id": "doc_000064", "sent_id": 7}], "id": "train_002304", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Samir Ames was assigned as the lead for Project Sonata on 2026-04-28.", "evidence": [{"doc_id": "doc_000268", "sent_id": 7}], "id": "train_002305", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster's run with Helix-8B failed because of an unstable-validation-loss error on 2026-05-08 while using Node Spruce-03.", "evidence": [{"doc_id": "doc_000423", "sent_id": 8}], "id": "train_002306", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Rohan Park was assigned as the lead on 2026-06-30 received 3 GPUs from Node Pine-07 on 2026-04-30.", "evidence": [{"doc_id": "doc_000193", "sent_id": 8}, {"doc_id": "doc_000110", "sent_id": 3}], "id": "train_002307", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Birch-04 allocated 2 GPUs to Project Meridian on 2026-04-12.", "evidence": [], "id": "train_002308", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Pine-07 allocated 5 GPUs to Project Sonata on 2026-04-25.", "evidence": [{"doc_id": "doc_000269", "sent_id": 6}], "id": "train_002309", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared macro F1 runs, Lumen-7B achieved a higher macro F1 than Quartz-7B.", "evidence": [{"doc_id": "doc_000462", "sent_id": 6}, {"doc_id": "doc_000401", "sent_id": 6}], "id": "train_002310", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster changed its method from document chunking to metric smoothing on 2026-06-21.", "evidence": [{"doc_id": "doc_000309", "sent_id": 5}], "id": "train_002311", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from document chunking to metric smoothing on 2026-06-22.", "evidence": [], "id": "train_002312", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Marble-3B achieved 0.586 evidence F1 on NereidNotes for Project Sonata on 2026-05-06.", "evidence": [], "id": "train_002313", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Mina Torres was assigned as the evaluation owner for Project Anchor on 2026-05-18.", "evidence": [{"doc_id": "doc_000417", "sent_id": 2}], "id": "train_002314", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Darian Grant was assigned as the lead for Project Nereid on 2026-04-21.", "evidence": [{"doc_id": "doc_000490", "sent_id": 7}], "id": "train_002315", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from query rewriting to sentence pruning on 2026-04-27.", "evidence": [], "id": "train_002316", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Darian Grant was assigned as the lead for Project Nereid on 2026-04-13.", "evidence": [{"doc_id": "doc_000073", "sent_id": 2}], "id": "train_002317", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Marble-3B achieved a higher latency efficiency score than Lumen-3B.", "evidence": [{"doc_id": "doc_000027", "sent_id": 7}, {"doc_id": "doc_000054", "sent_id": 2}], "id": "train_002318", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Nimbus-8B achieved 0.798 macro F1 on TraceEval-2 for Project Nereid on 2026-05-13.", "evidence": [], "id": "train_002319", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The review of Quiet Retriever Study 3 reported on 2026-06-30 that it used sentence pruning and did not use a reward model.", "evidence": [], "id": "train_002320", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Birch-04 allocated 2 GPUs to Project Meridian on 2026-04-26.", "evidence": [], "id": "train_002321", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared latency efficiency score runs, Vela-3B achieved a higher latency efficiency score than Nimbus-3B.", "evidence": [{"doc_id": "doc_000187", "sent_id": 2}, {"doc_id": "doc_000108", "sent_id": 7}], "id": "train_002322", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Hazel-14 allocated 1 GPU to Project Nereid on 2026-06-21.", "evidence": [], "id": "train_002323", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid's run with Cedar-8B failed because of an out-of-memory error on 2026-04-23 while using Node Rowan-09.", "evidence": [{"doc_id": "doc_000397", "sent_id": 3}], "id": "train_002324", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Elian Shah was assigned as the data steward for Project Meridian on 2026-05-05.", "evidence": [{"doc_id": "doc_000346", "sent_id": 7}], "id": "train_002325", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The review of Stable Chains Study 3 reported on 2026-06-16 that it used query rewriting and did not use a reward model.", "evidence": [], "id": "train_002326", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Ravi Reed was assigned as the evaluation owner for Project Meridian on 2026-04-15.", "evidence": [], "id": "train_002327", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from alias expansion to LoRA adaptation on 2026-04-22.", "evidence": [{"doc_id": "doc_000264", "sent_id": 5}], "id": "train_002328", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Cedar-3B for error analysis on 2026-04-02.", "evidence": [{"doc_id": "doc_000255", "sent_id": 3}], "id": "train_002329", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Dr. Ravi Nadir was assigned as the data steward for Project Meridian on 2026-06-02.", "evidence": [{"doc_id": "doc_000323", "sent_id": 7}], "id": "train_002330", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Arun Bauer was assigned as the evaluation owner on 2026-05-11 received 2 GPUs from Node Spruce-03 on 2026-04-01.", "evidence": [{"doc_id": "doc_000138", "sent_id": 2}, {"doc_id": "doc_000109", "sent_id": 4}], "id": "train_002331", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata moved the Milestone D2 deadline to 2026-06-14 on 2026-06-04.", "evidence": [], "id": "train_002332", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron moved the Milestone R1 deadline from 2026-06-14 to 2026-06-18 on 2026-05-20.", "evidence": [], "id": "train_002333", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-06-02 received 6 GPUs from Node Elm-08 on 2026-05-23.", "evidence": [{"doc_id": "doc_000256", "sent_id": 7}, {"doc_id": "doc_000386", "sent_id": 6}], "id": "train_002334", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron's run with Vela-3B failed because of a checkpoint-mismatch error on 2026-05-21 while using Node Maple-01.", "evidence": [{"doc_id": "doc_000391", "sent_id": 3}], "id": "train_002335", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Lena Costa was assigned as the data steward for Project Saffron on 2026-06-24.", "evidence": [], "id": "train_002336", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Dr. Julian Stone was assigned as the retrieval owner for Project Nereid on 2026-04-22.", "evidence": [], "id": "train_002337", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Granite Context Study 1 reported on 2026-04-13 that it used evidence pooling and did not use a reward model.", "evidence": [{"doc_id": "doc_000028", "sent_id": 2}], "id": "train_002338", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Atlas-7B for evidence retrieval on 2026-06-28.", "evidence": [{"doc_id": "doc_000281", "sent_id": 3}], "id": "train_002339", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Quartz-8B achieved a higher evidence F1 than Lumen-8B.", "evidence": [{"doc_id": "doc_000395", "sent_id": 6}, {"doc_id": "doc_000346", "sent_id": 6}], "id": "train_002340", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Vera Kim was assigned as the lead on 2026-06-01 selected Kestrel-3B for claim classification on 2026-04-16.", "evidence": [{"doc_id": "doc_000461", "sent_id": 2}, {"doc_id": "doc_000289", "sent_id": 3}], "id": "train_002341", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-06-29 moved the Milestone P1 deadline from 2026-06-16 to 2026-06-06 on 2026-05-19.", "evidence": [{"doc_id": "doc_000047", "sent_id": 2}, {"doc_id": "doc_000396", "sent_id": 7}], "id": "train_002342", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-06-08 had a run with Helix-8B on Node Poplar-12 that failed because of an out-of-memory error on 2026-05-14.", "evidence": [{"doc_id": "doc_000042", "sent_id": 2}, {"doc_id": "doc_000213", "sent_id": 3}], "id": "train_002343", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected River-3B for evidence retrieval on 2026-04-10.", "evidence": [{"doc_id": "doc_000483", "sent_id": 9}], "id": "train_002344", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected Orchid-7B for reranking on 2026-05-28.", "evidence": [{"doc_id": "doc_000262", "sent_id": 3}], "id": "train_002345", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Stable Chains Study 2 reported on 2026-05-23 that it used chain verification and did not use a reward model.", "evidence": [], "id": "train_002346", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-05-05 selected Helix-8B for claim classification on 2026-04-22.", "evidence": [{"doc_id": "doc_000368", "sent_id": 7}, {"doc_id": "doc_000010", "sent_id": 4}], "id": "train_002347", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Samir Ames was assigned as the lead on 2026-06-16 changed its method from late interaction to BM25 retrieval on 2026-04-04.", "evidence": [{"doc_id": "doc_000378", "sent_id": 7}, {"doc_id": "doc_000442", "sent_id": 6}], "id": "train_002348", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Quartz-8B achieved a higher evidence F1 than River-8B.", "evidence": [{"doc_id": "doc_000074", "sent_id": 6}, {"doc_id": "doc_000192", "sent_id": 2}], "id": "train_002349", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-05-12 received 3 GPUs from Node Spruce-03 on 2026-06-13.", "evidence": [{"doc_id": "doc_000151", "sent_id": 7}, {"doc_id": "doc_000170", "sent_id": 6}], "id": "train_002350", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Vela-7B for reranking on 2026-04-30.", "evidence": [{"doc_id": "doc_000151", "sent_id": 3}], "id": "train_002351", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Atlas-7B for evidence retrieval on 2026-06-28.", "evidence": [{"doc_id": "doc_000393", "sent_id": 4}], "id": "train_002352", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron moved the Milestone F1 deadline from 2026-06-24 to 2026-07-02 on 2026-06-18.", "evidence": [], "id": "train_002353", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid moved the Milestone B2 deadline to 2026-05-22 on 2026-05-06.", "evidence": [{"doc_id": "doc_000151", "sent_id": 5}], "id": "train_002354", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Atlas-7B for reranking on 2026-04-19.", "evidence": [{"doc_id": "doc_000404", "sent_id": 4}], "id": "train_002355", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from sentence pruning to cross-encoder reranking on 2026-05-11.", "evidence": [], "id": "train_002356", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Felix Lind was assigned as the lead for Project Aster on 2026-04-20.", "evidence": [{"doc_id": "doc_000285", "sent_id": 2}], "id": "train_002357", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Sycamore-13 allocated 3 GPUs to Project Sonata on 2026-04-25.", "evidence": [], "id": "train_002358", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-08 changed its method from reward reranking to data mixing on 2026-05-03.", "evidence": [{"doc_id": "doc_000062", "sent_id": 2}, {"doc_id": "doc_000462", "sent_id": 4}], "id": "train_002359", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Lumen-8B achieved 0.848 evidence F1 on TraceEval for Project Aster on 2026-05-02.", "evidence": [{"doc_id": "doc_000346", "sent_id": 6}], "id": "train_002360", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Frost was assigned as the data steward on 2026-06-15 selected Helix-3B for claim classification on 2026-04-23.", "evidence": [{"doc_id": "doc_000193", "sent_id": 2}, {"doc_id": "doc_000346", "sent_id": 3}], "id": "train_002361", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Owen Marin was assigned as the data steward on 2026-06-15 recorded accuracy for Mira-3B on MemoTrace-2 using Node Sycamore-13 on 2026-04-14.", "evidence": [{"doc_id": "doc_000288", "sent_id": 2}, {"doc_id": "doc_000373", "sent_id": 7}], "id": "train_002362", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster's run with Orchid-3B failed on Node Cedar-02 because of an out-of-memory error on 2026-04-17.", "evidence": [{"doc_id": "doc_000373", "sent_id": 8}], "id": "train_002363", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Atlas-7B for claim classification on 2026-05-10.", "evidence": [{"doc_id": "doc_000134", "sent_id": 4}], "id": "train_002364", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster selected Atlas-7B for claim classification on 2026-05-17.", "evidence": [{"doc_id": "doc_000232", "sent_id": 4}], "id": "train_002365", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-06-08 moved the Milestone B1 deadline to 2026-06-17 on 2026-06-11.", "evidence": [{"doc_id": "doc_000042", "sent_id": 2}, {"doc_id": "doc_000038", "sent_id": 3}], "id": "train_002366", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-06-22 had a failed run with Mira-7B on Node Sycamore-13 because of an unstable-validation-loss error on 2026-04-18.", "evidence": [{"doc_id": "doc_000300", "sent_id": 2}, {"doc_id": "doc_000342", "sent_id": 5}], "id": "train_002367", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Mira-8B for calibration on 2026-04-10.", "evidence": [{"doc_id": "doc_000142", "sent_id": 8}], "id": "train_002368", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor's run with Helix-8B failed on Node Poplar-12 because of a missing-index error on 2026-05-16.", "evidence": [{"doc_id": "doc_000360", "sent_id": 5}], "id": "train_002369", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid moved the Milestone V1 deadline to 2026-06-23 on 2026-06-17.", "evidence": [{"doc_id": "doc_000405", "sent_id": 5}], "id": "train_002370", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Aster-7B for evidence retrieval on 2026-05-21.", "evidence": [{"doc_id": "doc_000043", "sent_id": 3}], "id": "train_002371", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Helix-3B for claim classification on 2026-05-09.", "evidence": [], "id": "train_002372", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-06-22 had a failed run with Aster-7B on Node Willow-05 because of an unstable-validation-loss error on 2026-04-24.", "evidence": [{"doc_id": "doc_000300", "sent_id": 2}, {"doc_id": "doc_000229", "sent_id": 8}], "id": "train_002373", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Marble-3B achieved 0.601 latency efficiency score on VestaLogs-3 for Project Saffron on 2026-04-07.", "evidence": [], "id": "train_002374", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Nadia Chen was assigned as the evaluation owner on 2026-06-16 selected Atlas-7B for evidence retrieval on 2026-06-16.", "evidence": [{"doc_id": "doc_000093", "sent_id": 7}, {"doc_id": "doc_000359", "sent_id": 7}], "id": "train_002375", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Pale Compass Study 1 reported on 2026-05-06 that it used threshold search and used a reward model.", "evidence": [{"doc_id": "doc_000358", "sent_id": 5}], "id": "train_002376", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from BM25 retrieval to late interaction on 2026-04-15.", "evidence": [{"doc_id": "doc_000074", "sent_id": 4}], "id": "train_002377", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Dr. Ravi Nadir was assigned as the data steward for Project Meridian on 2026-06-16.", "evidence": [{"doc_id": "doc_000037", "sent_id": 7}], "id": "train_002378", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian selected Nimbus-7B for calibration on 2026-06-15.", "evidence": [], "id": "train_002379", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor's run with Finch-7B failed because of a missing-index error on 2026-05-01 while using Node Pine-07.", "evidence": [{"doc_id": "doc_000254", "sent_id": 8}], "id": "train_002380", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Iris Lane was assigned as the evaluation owner on 2026-05-04 recorded latency efficiency score for Helix-3B on CedarQA-3 using Node Poplar-12 on 2026-06-23.", "evidence": [{"doc_id": "doc_000316", "sent_id": 2}, {"doc_id": "doc_000032", "sent_id": 7}], "id": "train_002381", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Dr. Arun Kwan was assigned as the data steward for Project Anchor on 2026-05-12.", "evidence": [{"doc_id": "doc_000395", "sent_id": 7}], "id": "train_002382", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected River-7B for evidence retrieval on 2026-04-03.", "evidence": [{"doc_id": "doc_000118", "sent_id": 8}], "id": "train_002383", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Orchid-3B achieved 0.815 accuracy on LumenFacts for Project Anchor on 2026-06-27.", "evidence": [{"doc_id": "doc_000231", "sent_id": 10}], "id": "train_002384", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Aspen-01 allocated 1 GPU to Project Nereid on 2026-06-20.", "evidence": [], "id": "train_002385", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-05-05 had a run with Kestrel-3B that failed because of an out-of-memory error on 2026-05-02 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000050", "sent_id": 7}, {"doc_id": "doc_000263", "sent_id": 5}], "id": "train_002386", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor's run with River-8B failed on Node Rowan-09 because of a checkpoint-mismatch error on 2026-04-16.", "evidence": [{"doc_id": "doc_000264", "sent_id": 3}], "id": "train_002387", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster's run with River-7B failed on Node Rowan-09 because of a checkpoint-mismatch error on 2026-06-12.", "evidence": [{"doc_id": "doc_000128", "sent_id": 8}], "id": "train_002388", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Delta Evidence Study 3 reported on 2026-05-01 that it used chain verification and did not use a reward model.", "evidence": [{"doc_id": "doc_000028", "sent_id": 8}], "id": "train_002389", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Mira-3B achieved 0.699 evidence F1 on TraceEval-3 for Project Sonata on 2026-06-15.", "evidence": [{"doc_id": "doc_000108", "sent_id": 2}], "id": "train_002390", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-05-05 received 6 GPUs from Node Juniper-06 on 2026-06-27.", "evidence": [{"doc_id": "doc_000004", "sent_id": 9}, {"doc_id": "doc_000377", "sent_id": 6}], "id": "train_002391", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-05-04 changed its method from alias expansion to dense retrieval on 2026-06-06.", "evidence": [{"doc_id": "doc_000023", "sent_id": 2}, {"doc_id": "doc_000081", "sent_id": 5}], "id": "train_002392", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected River-8B for evidence retrieval on 2026-04-02.", "evidence": [{"doc_id": "doc_000070", "sent_id": 3}], "id": "train_002393", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid selected Atlas-7B for claim classification on 2026-06-07.", "evidence": [{"doc_id": "doc_000218", "sent_id": 4}], "id": "train_002394", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-05-18 moved the Milestone F1 deadline from 2026-07-17 to 2026-07-11 on 2026-06-23.", "evidence": [{"doc_id": "doc_000212", "sent_id": 2}, {"doc_id": "doc_000216", "sent_id": 7}], "id": "train_002395", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron's run with Vela-7B failed on Node Fir-10 because of an out-of-memory error on 2026-05-08.", "evidence": [{"doc_id": "doc_000499", "sent_id": 8}], "id": "train_002396", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid's run with Atlas-7B failed because of an out-of-memory error on 2026-06-26 while using Node Cedar-02.", "evidence": [{"doc_id": "doc_000032", "sent_id": 8}], "id": "train_002397", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared accuracy runs, Helix-8B achieved a higher accuracy than Marble-8B.", "evidence": [{"doc_id": "doc_000092", "sent_id": 6}, {"doc_id": "doc_000244", "sent_id": 6}], "id": "train_002398", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Helix-7B achieved 0.812 evidence F1 on OrionBench-2 for Project Aster on 2026-05-30.", "evidence": [{"doc_id": "doc_000323", "sent_id": 6}], "id": "train_002399", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Aspen-01 allocated 3 GPUs to Project Nereid on 2026-05-03.", "evidence": [], "id": "train_002400", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Dr. Julian Gray was assigned as the lead for Project Nereid on 2026-05-12.", "evidence": [{"doc_id": "doc_000307", "sent_id": 7}], "id": "train_002401", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid moved the Milestone V1 deadline to 2026-04-16 on 2026-04-11.", "evidence": [], "id": "train_002402", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The review of Pale Compass Study 1 reported on 2026-05-07 that it used threshold search and did not use a reward model.", "evidence": [], "id": "train_002403", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Evan Iyer was assigned as the retrieval owner for Project Nereid on 2026-06-23.", "evidence": [{"doc_id": "doc_000376", "sent_id": 7}], "id": "train_002404", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian moved the Milestone J1 deadline to 2026-06-27 on 2026-06-17.", "evidence": [{"doc_id": "doc_000062", "sent_id": 5}], "id": "train_002405", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from threshold search to reward reranking on 2026-04-21.", "evidence": [], "id": "train_002406", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Atlas-7B for error analysis on 2026-06-07.", "evidence": [{"doc_id": "doc_000430", "sent_id": 4}], "id": "train_002407", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Mira Sato was assigned as the data steward for Project Meridian on 2026-05-16.", "evidence": [], "id": "train_002408", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone J1 deadline from 2026-04-25 to 2026-05-01 on 2026-04-16.", "evidence": [], "id": "train_002409", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian changed its method from hybrid retrieval to document chunking on 2026-05-04.", "evidence": [], "id": "train_002410", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone N1 deadline from 2026-04-29 to 2026-05-05 on 2026-04-09.", "evidence": [], "id": "train_002411", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mira-7B achieved 0.598 macro F1 on TraceEval-3 for Project Sonata on 2026-06-09.", "evidence": [], "id": "train_002412", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mara Lane was assigned as the retrieval owner for Project Sonata on 2026-04-14.", "evidence": [{"doc_id": "doc_000414", "sent_id": 10}], "id": "train_002413", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Vela-8B achieved 0.640 evidence F1 on LabQA-2 for Project Sonata on 2026-05-03.", "evidence": [], "id": "train_002414", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Nested Verifier Study 4 reported on 2026-06-18 that it used rank fusion and did not use a reward model.", "evidence": [], "id": "train_002415", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Marble-8B for error analysis on 2026-05-01.", "evidence": [{"doc_id": "doc_000480", "sent_id": 8}], "id": "train_002416", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Atlas-3B achieved 0.879 latency efficiency score on SignalSet-3 for Project Meridian on 2026-04-06.", "evidence": [], "id": "train_002417", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian selected Cedar-8B for error analysis on 2026-06-18.", "evidence": [{"doc_id": "doc_000248", "sent_id": 3}], "id": "train_002418", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian moved the Milestone B2 deadline from 2026-06-27 to 2026-07-05 on 2026-06-04.", "evidence": [], "id": "train_002419", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Selene Rios was assigned as the data steward for Project Anchor on 2026-06-02.", "evidence": [{"doc_id": "doc_000491", "sent_id": 7}], "id": "train_002420", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected Finch-8B for error analysis on 2026-04-16.", "evidence": [{"doc_id": "doc_000480", "sent_id": 3}], "id": "train_002421", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Finch-7B for error analysis on 2026-04-17.", "evidence": [{"doc_id": "doc_000255", "sent_id": 8}], "id": "train_002422", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Juniper-06 allocated 6 GPUs to Project Anchor on 2026-04-25.", "evidence": [{"doc_id": "doc_000479", "sent_id": 6}], "id": "train_002423", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone P1 deadline from 2026-06-21 to 2026-06-27 on 2026-06-03.", "evidence": [{"doc_id": "doc_000388", "sent_id": 5}], "id": "train_002424", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from hard-negative mining to LoRA adaptation on 2026-05-08.", "evidence": [], "id": "train_002425", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Rohan Park was assigned as the lead on 2026-04-14 recorded evidence F1 for Mira-3B on TraceEval-3 using Node Sycamore-13 on 2026-06-09.", "evidence": [{"doc_id": "doc_000327", "sent_id": 7}, {"doc_id": "doc_000486", "sent_id": 7}], "id": "train_002426", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata had a failed run with Mira-7B on Node Sycamore-13 because of an unstable-validation-loss error on 2026-06-05.", "evidence": [{"doc_id": "doc_000033", "sent_id": 8}], "id": "train_002427", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor's run with River-8B failed on Node Rowan-09 because of a missing-index error on 2026-04-16.", "evidence": [{"doc_id": "doc_000264", "sent_id": 3}], "id": "train_002428", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Nimbus-3B achieved 0.884 latency efficiency score on MemoTrace for Project Meridian on 2026-06-30.", "evidence": [{"doc_id": "doc_000108", "sent_id": 7}], "id": "train_002429", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Orchid-8B achieved 0.886 accuracy on LumenFacts for Project Anchor on 2026-06-01.", "evidence": [{"doc_id": "doc_000356", "sent_id": 2}], "id": "train_002430", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from dense retrieval to rank fusion on 2026-05-03.", "evidence": [{"doc_id": "doc_000165", "sent_id": 4}], "id": "train_002431", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-05-25 changed its method from LoRA adaptation to alias expansion on 2026-04-11.", "evidence": [{"doc_id": "doc_000351", "sent_id": 1}, {"doc_id": "doc_000347", "sent_id": 6}], "id": "train_002432", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Pine-07 allocated 1 GPU to Project Sonata on 2026-06-07.", "evidence": [], "id": "train_002433", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone N1 deadline from 2026-04-30 to 2026-05-06 on 2026-04-08.", "evidence": [{"doc_id": "doc_000255", "sent_id": 5}], "id": "train_002434", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Cedar-3B achieved 0.767 macro F1 on NereidNotes-3 for Project Nereid on 2026-04-13.", "evidence": [{"doc_id": "doc_000404", "sent_id": 2}], "id": "train_002435", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mira-7B achieved 0.609 macro F1 on TraceEval-3 for Project Sonata on 2026-06-20.", "evidence": [{"doc_id": "doc_000062", "sent_id": 6}], "id": "train_002436", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Felix Lind was assigned as the lead for Project Aster on 2026-04-14.", "evidence": [{"doc_id": "doc_000292", "sent_id": 7}], "id": "train_002437", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Node Sycamore-13 allocated 5 GPUs to Project Sonata on 2026-05-30.", "evidence": [{"doc_id": "doc_000456", "sent_id": 6}], "id": "train_002438", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Kira Iyer was assigned as the evaluation owner for Project Anchor on 2026-06-16.", "evidence": [{"doc_id": "doc_000461", "sent_id": 7}], "id": "train_002439", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from alias expansion to dense retrieval on 2026-05-31.", "evidence": [{"doc_id": "doc_000267", "sent_id": 4}], "id": "train_002440", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster moved the Milestone B1 deadline to 2026-05-10 on 2026-05-05.", "evidence": [], "id": "train_002441", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared evidence F1 runs, Marble-3B achieved a higher evidence F1 than Nimbus-3B.", "evidence": [{"doc_id": "doc_000343", "sent_id": 3}, {"doc_id": "doc_000345", "sent_id": 7}], "id": "train_002442", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor's run with Orchid-3B failed because of a missing-index error on 2026-06-18 while using Node Fir-10.", "evidence": [{"doc_id": "doc_000108", "sent_id": 3}], "id": "train_002443", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Mira-8B achieved 0.553 latency efficiency score on MemoTrace-2 for Project Saffron on 2026-05-10.", "evidence": [], "id": "train_002444", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Nora Bauer was assigned as the lead for Project Nereid on 2026-05-06.", "evidence": [], "id": "train_002445", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Lumen-8B achieved 0.822 accuracy on MemoTrace-3 for Project Anchor on 2026-05-19.", "evidence": [{"doc_id": "doc_000489", "sent_id": 7}], "id": "train_002446", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Delta Evidence Study 1 reported on 2026-06-05 that it used alias expansion and used a reward model.", "evidence": [{"doc_id": "doc_000154", "sent_id": 8}], "id": "train_002447", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Darian Hale was assigned as the retrieval owner for Project Nereid on 2026-04-06.", "evidence": [{"doc_id": "doc_000114", "sent_id": 2}], "id": "train_002448", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Finch-8B achieved 0.896 evidence F1 on NereidNotes-2 for Project Aster on 2026-04-25.", "evidence": [{"doc_id": "doc_000234", "sent_id": 6}], "id": "train_002449", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian's run with Nova-3B failed because of a checkpoint-mismatch error on 2026-04-02 while using Node Sycamore-13.", "evidence": [{"doc_id": "doc_000373", "sent_id": 3}], "id": "train_002450", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Helix-8B achieved 0.527 latency efficiency score on CedarQA-3 for Project Anchor on 2026-05-09.", "evidence": [{"doc_id": "doc_000307", "sent_id": 6}], "id": "train_002451", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Vector Lantern Study 1 reported on 2026-04-22 that it used confidence calibration and did not use a reward model.", "evidence": [{"doc_id": "doc_000371", "sent_id": 5}], "id": "train_002452", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Dr. Arun Bauer was assigned as the evaluation owner for Project Anchor on 2026-06-24.", "evidence": [], "id": "train_002453", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Sofia Frost was assigned as the lead for Project Sonata on 2026-06-09.", "evidence": [{"doc_id": "doc_000452", "sent_id": 7}], "id": "train_002454", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nimbus-8B achieved 0.708 macro F1 on TraceEval-2 for Project Nereid on 2026-05-31.", "evidence": [], "id": "train_002455", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron moved the Milestone D2 deadline from 2026-06-02 to 2026-06-06 on 2026-05-14.", "evidence": [], "id": "train_002456", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Anika Costa was assigned as the retrieval owner on 2026-05-19 received 3 GPUs from Node Poplar-12 on 2026-06-04.", "evidence": [{"doc_id": "doc_000018", "sent_id": 7}, {"doc_id": "doc_000167", "sent_id": 2}], "id": "train_002457", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron selected Vela-8B for reranking on 2026-05-28.", "evidence": [{"doc_id": "doc_000267", "sent_id": 3}], "id": "train_002458", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian changed its method from rank fusion to dense retrieval on 2026-04-08.", "evidence": [{"doc_id": "doc_000242", "sent_id": 5}], "id": "train_002459", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared evidence F1 runs, Orchid-8B achieved a higher evidence F1 than Mira-3B.", "evidence": [{"doc_id": "doc_000466", "sent_id": 2}, {"doc_id": "doc_000486", "sent_id": 7}], "id": "train_002460", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Priya Vale was assigned as the evaluation owner for Project Saffron on 2026-06-15.", "evidence": [{"doc_id": "doc_000231", "sent_id": 2}], "id": "train_002461", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster had a failed run with Cedar-8B on Node Spruce-03 because of an unstable-validation-loss error on 2026-04-10.", "evidence": [{"doc_id": "doc_000253", "sent_id": 8}], "id": "train_002462", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-05-31 recorded accuracy for Orchid-3B on LumenFacts using Node Pine-07 on 2026-06-15.", "evidence": [{"doc_id": "doc_000236", "sent_id": 4}, {"doc_id": "doc_000488", "sent_id": 1}], "id": "train_002463", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Cedar-02 allocated 6 GPUs to Project Anchor on 2026-06-06.", "evidence": [{"doc_id": "doc_000207", "sent_id": 6}], "id": "train_002464", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Node Sycamore-13 allocated 5 GPUs to Project Sonata on 2026-06-13.", "evidence": [{"doc_id": "doc_000072", "sent_id": 6}], "id": "train_002465", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron had a failed run with Orchid-3B on Node Willow-05 because of a checkpoint-mismatch error on 2026-06-18.", "evidence": [{"doc_id": "doc_000089", "sent_id": 2}], "id": "train_002466", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Finch-3B achieved a higher evidence F1 than Orchid-8B.", "evidence": [{"doc_id": "doc_000027", "sent_id": 2}, {"doc_id": "doc_000328", "sent_id": 6}], "id": "train_002467", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "In the compared latency efficiency score runs, Cedar-3B achieved a higher latency efficiency score than River-3B.", "evidence": [{"doc_id": "doc_000471", "sent_id": 2}, {"doc_id": "doc_000120", "sent_id": 7}], "id": "train_002468", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid's run with Nimbus-7B failed because of an unstable-validation-loss error on 2026-05-07 while using Node Rowan-09.", "evidence": [{"doc_id": "doc_000041", "sent_id": 3}], "id": "train_002469", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Finch-3B achieved 0.907 evidence F1 on NereidNotes-2 for Project Aster on 2026-06-15.", "evidence": [{"doc_id": "doc_000027", "sent_id": 2}], "id": "train_002470", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid's run with Atlas-3B failed on Node Hazel-14 because of a checkpoint-mismatch error on 2026-04-09.", "evidence": [{"doc_id": "doc_000074", "sent_id": 2}], "id": "train_002471", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Finch-7B achieved 0.836 evidence F1 on NereidNotes-2 for Project Aster on 2026-04-11.", "evidence": [{"doc_id": "doc_000246", "sent_id": 6}], "id": "train_002472", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Helix-8B achieved 0.872 evidence F1 on OrionBench-2 for Project Aster on 2026-06-13.", "evidence": [{"doc_id": "doc_000037", "sent_id": 6}], "id": "train_002473", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Kira Iyer was assigned as the evaluation owner for Project Anchor on 2026-05-06.", "evidence": [], "id": "train_002474", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron had a failed run with Aster-3B on Node Willow-05 because of a missing-index error on 2026-06-05.", "evidence": [{"doc_id": "doc_000363", "sent_id": 8}], "id": "train_002475", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The review of Granite Context Study 3 reported on 2026-06-12 that it used metric smoothing and used a reward model.", "evidence": [{"doc_id": "doc_000283", "sent_id": 8}], "id": "train_002476", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected Orchid-7B for reranking on 2026-05-25.", "evidence": [], "id": "train_002477", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The review of Pale Compass Study 4 reported on 2026-06-05 that it used reward reranking and used a reward model.", "evidence": [{"doc_id": "doc_000418", "sent_id": 8}], "id": "train_002478", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "River-7B achieved 0.719 latency efficiency score on SignalSet-2 for Project Anchor on 2026-06-03.", "evidence": [], "id": "train_002479", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The review of Amber Ranking Study 1 reported on 2026-06-15 that it used teacher distillation and did not use a reward model.", "evidence": [{"doc_id": "doc_000200", "sent_id": 2}], "id": "train_002480", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Atlas-7B for evidence retrieval on 2026-04-17.", "evidence": [{"doc_id": "doc_000374", "sent_id": 8}], "id": "train_002481", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from data mixing to reward reranking on 2026-05-06.", "evidence": [{"doc_id": "doc_000303", "sent_id": 5}], "id": "train_002482", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Quartz-8B for claim classification on 2026-06-25.", "evidence": [{"doc_id": "doc_000182", "sent_id": 3}], "id": "train_002483", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from data mixing to contrastive tuning on 2026-05-07.", "evidence": [], "id": "train_002484", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Nimbus-8B for calibration on 2026-06-14.", "evidence": [{"doc_id": "doc_000261", "sent_id": 4}], "id": "train_002485", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira-8B achieved 0.759 macro F1 on TraceEval-3 for Project Sonata on 2026-06-16.", "evidence": [{"doc_id": "doc_000034", "sent_id": 7}], "id": "train_002486", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira-8B achieved 0.684 latency efficiency score on MemoTrace-2 for Project Saffron on 2026-04-27.", "evidence": [{"doc_id": "doc_000173", "sent_id": 2}], "id": "train_002487", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-04-28 selected River-3B for evidence retrieval on 2026-05-31.", "evidence": [{"doc_id": "doc_000402", "sent_id": 7}, {"doc_id": "doc_000079", "sent_id": 4}], "id": "train_002488", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Marble-7B achieved 0.646 macro F1 on NereidNotes for Project Sonata on 2026-06-27.", "evidence": [{"doc_id": "doc_000309", "sent_id": 7}], "id": "train_002489", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-05-05 had a run with Kestrel-8B that failed because of an out-of-memory error on 2026-06-11 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000050", "sent_id": 7}, {"doc_id": "doc_000312", "sent_id": 3}], "id": "train_002490", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid's run with Kestrel-8B failed because of an out-of-memory error on 2026-06-05 while using Node Poplar-12.", "evidence": [{"doc_id": "doc_000456", "sent_id": 8}], "id": "train_002491", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Elm-08 allocated 3 GPUs to Project Nereid on 2026-05-02.", "evidence": [{"doc_id": "doc_000423", "sent_id": 6}], "id": "train_002492", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Poplar-12 allocated 3 GPUs to Project Aster on 2026-05-03.", "evidence": [], "id": "train_002493", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-04-12 had a failed run with Quartz-8B on Node Pine-07 because of a checkpoint-mismatch error on 2026-05-14.", "evidence": [{"doc_id": "doc_000186", "sent_id": 3}, {"doc_id": "doc_000035", "sent_id": 3}], "id": "train_002494", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster changed its method from calibrated voting to confidence calibration on 2026-04-05.", "evidence": [{"doc_id": "doc_000374", "sent_id": 4}], "id": "train_002495", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, Atlas-8B achieved a higher macro F1 than Finch-8B.", "evidence": [{"doc_id": "doc_000320", "sent_id": 4}, {"doc_id": "doc_000411", "sent_id": 6}], "id": "train_002496", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor changed its method from data mixing to contrastive tuning on 2026-05-15.", "evidence": [], "id": "train_002497", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-06-29 changed its method from dense retrieval to rank fusion on 2026-06-14.", "evidence": [{"doc_id": "doc_000265", "sent_id": 2}, {"doc_id": "doc_000225", "sent_id": 4}], "id": "train_002498", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Atlas-7B for reranking on 2026-04-19.", "evidence": [{"doc_id": "doc_000024", "sent_id": 4}], "id": "train_002499", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-06-22 recorded evidence F1 for Aster-7B on RiverBench-3 using Node Rowan-09 on 2026-05-12.", "evidence": [{"doc_id": "doc_000300", "sent_id": 2}, {"doc_id": "doc_000230", "sent_id": 7}], "id": "train_002500", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
