{"claim": "The project where Dr. Noah Chen was assigned as the retrieval owner on 2026-04-06 selected River-8B for evidence retrieval on 2026-06-26.", "evidence": [{"doc_id": "doc_000271", "sent_id": 2}, {"doc_id": "doc_000259", "sent_id": 8}], "id": "dev_000001", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron had a failed run with Mira-8B on Node Sycamore-13 because of a missing-index error on 2026-04-09.", "evidence": [{"doc_id": "doc_000302", "sent_id": 3}], "id": "dev_000002", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Dr. Ravi Nadir was assigned as the data steward on 2026-06-22 selected Kestrel-8B for claim classification on 2026-05-28.", "evidence": [{"doc_id": "doc_000103", "sent_id": 2}, {"doc_id": "doc_000375", "sent_id": 3}], "id": "dev_000003", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata had a failed run with Quartz-3B on Node Pine-07 because of an out-of-memory error on 2026-04-02.", "evidence": [{"doc_id": "doc_000171", "sent_id": 3}], "id": "dev_000004", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Nested Verifier Study 2 reported on 2026-06-19 that it used calibrated voting and used a reward model.", "evidence": [{"doc_id": "doc_000359", "sent_id": 8}], "id": "dev_000005", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mina Adler was assigned as the retrieval owner for Project Aster on 2026-04-29.", "evidence": [], "id": "dev_000006", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone D2 deadline to 2026-07-14 on 2026-06-11.", "evidence": [], "id": "dev_000007", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Mina Torres was assigned as the retrieval owner for Project Sonata on 2026-06-08.", "evidence": [{"doc_id": "doc_000204", "sent_id": 2}], "id": "dev_000008", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Quartz-8B for claim classification on 2026-05-17.", "evidence": [{"doc_id": "doc_000009", "sent_id": 4}], "id": "dev_000009", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Helix-8B achieved 0.797 latency efficiency score on CedarQA-3 for Project Anchor on 2026-06-15.", "evidence": [{"doc_id": "doc_000240", "sent_id": 2}], "id": "dev_000010", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster's run with Helix-7B failed on Node Poplar-12 because of an unstable-validation-loss error on 2026-04-30.", "evidence": [{"doc_id": "doc_000385", "sent_id": 3}], "id": "dev_000011", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Marble-3B achieved 0.657 evidence F1 on NereidNotes for Project Sonata on 2026-05-05.", "evidence": [{"doc_id": "doc_000296", "sent_id": 7}], "id": "dev_000012", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from data mixing to contrastive tuning on 2026-06-22.", "evidence": [], "id": "dev_000013", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Elian Ames was assigned as the evaluation owner on 2026-05-04 changed its method from cross-encoder reranking to sentence pruning on 2026-04-26.", "evidence": [{"doc_id": "doc_000018", "sent_id": 2}, {"doc_id": "doc_000025", "sent_id": 4}], "id": "dev_000014", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Iris Lane was assigned as the evaluation owner on 2026-05-04 selected Finch-3B for error analysis on 2026-05-29.", "evidence": [{"doc_id": "doc_000316", "sent_id": 2}, {"doc_id": "doc_000138", "sent_id": 8}], "id": "dev_000015", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron had a failed run with Aster-7B on Node Willow-05 because of a checkpoint-mismatch error on 2026-06-04.", "evidence": [{"doc_id": "doc_000076", "sent_id": 3}], "id": "dev_000016", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Elm-08 allocated 3 GPUs to Project Nereid on 2026-06-20.", "evidence": [], "id": "dev_000017", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Cedar-3B achieved 0.722 latency efficiency score on VestaLogs-2 for Project Meridian on 2026-05-18.", "evidence": [{"doc_id": "doc_000471", "sent_id": 2}], "id": "dev_000018", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Nova-8B achieved a higher evidence F1 than Quartz-7B.", "evidence": [{"doc_id": "doc_000213", "sent_id": 7}, {"doc_id": "doc_000264", "sent_id": 2}], "id": "dev_000019", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Nested Verifier Study 3 reported on 2026-06-22 that it used rank fusion and used a reward model.", "evidence": [{"doc_id": "doc_000008", "sent_id": 2}], "id": "dev_000020", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from evidence pooling to calibrated voting on 2026-04-17.", "evidence": [], "id": "dev_000021", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Dr. Noah Vale was assigned as the lead on 2026-04-07 selected Helix-3B for claim classification on 2026-06-18.", "evidence": [{"doc_id": "doc_000103", "sent_id": 9}, {"doc_id": "doc_000288", "sent_id": 4}], "id": "dev_000022", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata's run with Mira-7B failed on Node Sycamore-13 because of an unstable-validation-loss error on 2026-05-21.", "evidence": [{"doc_id": "doc_000471", "sent_id": 3}], "id": "dev_000023", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata selected Vela-8B for reranking on 2026-06-14.", "evidence": [{"doc_id": "doc_000201", "sent_id": 4}], "id": "dev_000024", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from contrastive tuning to threshold search on 2026-05-20.", "evidence": [{"doc_id": "doc_000009", "sent_id": 5}], "id": "dev_000025", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from late interaction to BM25 retrieval on 2026-06-02.", "evidence": [], "id": "dev_000026", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid changed its method from temporal filtering to alias expansion on 2026-06-17.", "evidence": [{"doc_id": "doc_000121", "sent_id": 5}], "id": "dev_000027", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Owen Torres was assigned as the evaluation owner on 2026-06-16 selected Marble-7B for error analysis on 2026-05-14.", "evidence": [{"doc_id": "doc_000463", "sent_id": 9}, {"doc_id": "doc_000315", "sent_id": 3}], "id": "dev_000028", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Vela-3B achieved 0.535 accuracy on LumenFacts-3 for Project Saffron on 2026-04-27.", "evidence": [{"doc_id": "doc_000385", "sent_id": 2}], "id": "dev_000029", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Sonata changed its method from data mixing to reward reranking on 2026-04-15.", "evidence": [{"doc_id": "doc_000189", "sent_id": 5}], "id": "dev_000030", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Nimbus-7B achieved 0.663 latency efficiency score on MemoTrace for Project Meridian on 2026-06-08.", "evidence": [{"doc_id": "doc_000121", "sent_id": 2}], "id": "dev_000031", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata's run with Mira-3B failed on Node Sycamore-13 because of a missing-index error on 2026-05-22.", "evidence": [{"doc_id": "doc_000194", "sent_id": 8}], "id": "dev_000032", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Birch-04 allocated 4 GPUs to Project Meridian on 2026-04-11.", "evidence": [{"doc_id": "doc_000209", "sent_id": 6}], "id": "dev_000033", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Finch-8B for error analysis on 2026-06-07.", "evidence": [{"doc_id": "doc_000076", "sent_id": 4}], "id": "dev_000034", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Felix Lind was assigned as the lead on 2026-04-14 changed its method from contrastive tuning to data mixing on 2026-04-12.", "evidence": [{"doc_id": "doc_000292", "sent_id": 7}, {"doc_id": "doc_000061", "sent_id": 4}], "id": "dev_000035", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Anika Costa was assigned as the retrieval owner on 2026-05-19 changed its method from structured prompting to chain verification on 2026-06-07.", "evidence": [{"doc_id": "doc_000018", "sent_id": 7}, {"doc_id": "doc_000461", "sent_id": 4}], "id": "dev_000036", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Maple-01 allocated 4 GPUs to Project Saffron on 2026-06-20.", "evidence": [{"doc_id": "doc_000121", "sent_id": 6}], "id": "dev_000037", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Jonas Rios was assigned as the lead on 2026-05-25 changed its method from QLoRA adaptation to structured prompting on 2026-04-01.", "evidence": [{"doc_id": "doc_000384", "sent_id": 2}, {"doc_id": "doc_000139", "sent_id": 5}], "id": "dev_000038", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Cedar-8B achieved 0.711 accuracy on VestaLogs-2 for Project Meridian on 2026-05-20.", "evidence": [], "id": "dev_000039", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor's run with Finch-3B failed on Node Juniper-06 because of a missing-index error on 2026-04-23.", "evidence": [{"doc_id": "doc_000022", "sent_id": 3}], "id": "dev_000040", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-04-27 changed its method from late interaction to cross-encoder reranking on 2026-04-15.", "evidence": [{"doc_id": "doc_000165", "sent_id": 2}, {"doc_id": "doc_000278", "sent_id": 5}], "id": "dev_000041", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Quartz-7B achieved 0.648 accuracy on CedarQA-2 for Project Saffron on 2026-05-12.", "evidence": [{"doc_id": "doc_000147", "sent_id": 7}], "id": "dev_000042", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from dense retrieval to rank fusion on 2026-05-27.", "evidence": [{"doc_id": "doc_000080", "sent_id": 5}], "id": "dev_000043", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Kestrel-3B for claim classification on 2026-06-06.", "evidence": [], "id": "dev_000044", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "In the compared latency efficiency score runs, Helix-3B achieved a higher latency efficiency score than Finch-3B.", "evidence": [{"doc_id": "doc_000484", "sent_id": 2}, {"doc_id": "doc_000387", "sent_id": 7}], "id": "dev_000045", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Rohan Park was assigned as the lead on 2026-06-30 changed its method from document chunking to hybrid retrieval on 2026-06-17.", "evidence": [{"doc_id": "doc_000193", "sent_id": 8}, {"doc_id": "doc_000135", "sent_id": 5}], "id": "dev_000046", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian moved the Milestone V1 deadline from 2026-05-04 to 2026-05-14 on 2026-04-09.", "evidence": [], "id": "dev_000047", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Nova-7B achieved 0.720 evidence F1 on LabQA for Project Nereid on 2026-05-12.", "evidence": [{"doc_id": "doc_000385", "sent_id": 7}], "id": "dev_000048", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid changed its method from BM25 retrieval to evidence pooling on 2026-06-09.", "evidence": [], "id": "dev_000049", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Mina Torres was assigned as the evaluation owner for Project Anchor on 2026-06-29.", "evidence": [{"doc_id": "doc_000209", "sent_id": 2}], "id": "dev_000050", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian moved the Milestone D1 deadline from 2026-06-24 to 2026-07-02 on 2026-06-13.", "evidence": [], "id": "dev_000051", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor selected Finch-3B for error analysis on 2026-05-30.", "evidence": [], "id": "dev_000052", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Sofia Nadir was assigned as the retrieval owner for Project Sonata on 2026-06-17.", "evidence": [], "id": "dev_000053", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, River-3B achieved a higher latency efficiency score than Lumen-3B.", "evidence": [{"doc_id": "doc_000055", "sent_id": 2}, {"doc_id": "doc_000303", "sent_id": 7}], "id": "dev_000054", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster changed its method from hybrid retrieval to document chunking on 2026-06-24.", "evidence": [{"doc_id": "doc_000240", "sent_id": 5}], "id": "dev_000055", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared macro F1 runs, Aster-7B achieved a higher macro F1 than Finch-7B.", "evidence": [{"doc_id": "doc_000055", "sent_id": 7}, {"doc_id": "doc_000201", "sent_id": 2}], "id": "dev_000056", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Delta Evidence Study 2 reported on 2026-05-27 that it used document chunking and used a reward model.", "evidence": [{"doc_id": "doc_000317", "sent_id": 5}], "id": "dev_000057", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster's run with River-8B failed because of an unstable-validation-loss error on 2026-05-15 while using Node Elm-08.", "evidence": [{"doc_id": "doc_000147", "sent_id": 8}], "id": "dev_000058", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Aster-8B achieved 0.544 evidence F1 on RiverBench-3 for Project Sonata on 2026-04-20.", "evidence": [{"doc_id": "doc_000022", "sent_id": 2}], "id": "dev_000059", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor's run with River-7B failed on Node Rowan-09 because of a missing-index error on 2026-04-02.", "evidence": [{"doc_id": "doc_000335", "sent_id": 2}], "id": "dev_000060", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Lumen-8B for calibration on 2026-04-06.", "evidence": [], "id": "dev_000061", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Nova-3B achieved 0.675 latency efficiency score on LumenFacts-2 for Project Meridian on 2026-06-29.", "evidence": [{"doc_id": "doc_000171", "sent_id": 2}], "id": "dev_000062", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from hard-negative mining to LoRA adaptation on 2026-05-06.", "evidence": [{"doc_id": "doc_000385", "sent_id": 5}], "id": "dev_000063", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Camila Quinn was assigned as the evaluation owner on 2026-04-27 selected Cedar-7B for error analysis on 2026-06-25.", "evidence": [{"doc_id": "doc_000151", "sent_id": 2}, {"doc_id": "doc_000308", "sent_id": 4}], "id": "dev_000064", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron's run with Marble-3B failed on Node Maple-01 because of an out-of-memory error on 2026-06-18.", "evidence": [{"doc_id": "doc_000120", "sent_id": 3}], "id": "dev_000065", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Evan Moss was assigned as the lead on 2026-06-30 selected Cedar-7B for error analysis on 2026-05-21.", "evidence": [{"doc_id": "doc_000288", "sent_id": 10}, {"doc_id": "doc_000249", "sent_id": 4}], "id": "dev_000066", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster's run with Orchid-3B failed on Node Cedar-02 because of an unstable-validation-loss error on 2026-04-23.", "evidence": [{"doc_id": "doc_000499", "sent_id": 3}], "id": "dev_000067", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor moved the Milestone H1 deadline from 2026-05-19 to 2026-05-29 on 2026-05-07.", "evidence": [], "id": "dev_000068", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-05-26 selected Kestrel-8B for claim classification on 2026-04-23.", "evidence": [{"doc_id": "doc_000241", "sent_id": 7}, {"doc_id": "doc_000036", "sent_id": 4}], "id": "dev_000069", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Mina Shah was assigned as the lead for Project Aster on 2026-04-22.", "evidence": [], "id": "dev_000070", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian moved the Milestone V1 deadline from 2026-06-16 to 2026-06-26 on 2026-05-20.", "evidence": [], "id": "dev_000071", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid selected Kestrel-7B for claim classification on 2026-05-31.", "evidence": [{"doc_id": "doc_000353", "sent_id": 4}], "id": "dev_000072", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron changed its method from confidence calibration to chain verification on 2026-04-08.", "evidence": [{"doc_id": "doc_000335", "sent_id": 4}], "id": "dev_000073", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Atlas-7B for error analysis on 2026-06-17.", "evidence": [{"doc_id": "doc_000204", "sent_id": 5}], "id": "dev_000074", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Orchid-3B achieved 0.860 macro F1 on LabQA-3 for Project Aster on 2026-04-14.", "evidence": [{"doc_id": "doc_000171", "sent_id": 7}], "id": "dev_000075", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Elm-08 allocated 4 GPUs to Project Meridian on 2026-05-12.", "evidence": [], "id": "dev_000076", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "River-8B achieved 0.794 macro F1 on RiverBench for Project Aster on 2026-05-18.", "evidence": [{"doc_id": "doc_000080", "sent_id": 2}], "id": "dev_000077", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid had a failed run with Lumen-8B on Node Birch-04 because of an out-of-memory error on 2026-05-28.", "evidence": [{"doc_id": "doc_000079", "sent_id": 3}], "id": "dev_000078", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Cedar-7B achieved 0.666 evidence F1 on NereidNotes-3 for Project Nereid on 2026-04-06.", "evidence": [{"doc_id": "doc_000302", "sent_id": 2}], "id": "dev_000079", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian changed its method from LoRA adaptation to alias expansion on 2026-06-10.", "evidence": [{"doc_id": "doc_000076", "sent_id": 5}], "id": "dev_000080", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Nimbus-7B achieved a higher accuracy than Vela-7B.", "evidence": [{"doc_id": "doc_000046", "sent_id": 7}, {"doc_id": "doc_000213", "sent_id": 2}], "id": "dev_000081", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Aspen-01 allocated 2 GPUs to Project Meridian on 2026-04-05.", "evidence": [], "id": "dev_000082", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor's run with River-3B failed on Node Rowan-09 because of a checkpoint-mismatch error on 2026-04-03.", "evidence": [{"doc_id": "doc_000240", "sent_id": 8}], "id": "dev_000083", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Aster-3B achieved 0.589 accuracy on SignalSet for Project Saffron on 2026-06-02.", "evidence": [{"doc_id": "doc_000080", "sent_id": 7}], "id": "dev_000084", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster selected Atlas-7B for evidence retrieval on 2026-05-17.", "evidence": [{"doc_id": "doc_000035", "sent_id": 4}], "id": "dev_000085", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian changed its method from confidence calibration to calibrated voting on 2026-05-20.", "evidence": [{"doc_id": "doc_000035", "sent_id": 5}], "id": "dev_000086", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian's run with Kestrel-7B failed because of a checkpoint-mismatch error on 2026-04-17 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000171", "sent_id": 8}], "id": "dev_000087", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Aster-3B for evidence retrieval on 2026-04-18.", "evidence": [], "id": "dev_000088", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian changed its method from alias expansion to dense retrieval on 2026-04-15.", "evidence": [{"doc_id": "doc_000302", "sent_id": 5}], "id": "dev_000089", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian's run with Nova-3B failed because of a checkpoint-mismatch error on 2026-06-26 while using Node Aspen-01.", "evidence": [{"doc_id": "doc_000121", "sent_id": 8}], "id": "dev_000090", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid changed its method from QLoRA adaptation to hybrid retrieval on 2026-04-08.", "evidence": [{"doc_id": "doc_000171", "sent_id": 5}], "id": "dev_000091", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, River-3B achieved a higher macro F1 than Lumen-3B.", "evidence": [{"doc_id": "doc_000007", "sent_id": 2}, {"doc_id": "doc_000228", "sent_id": 7}], "id": "dev_000092", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Rowan-09 allocated 3 GPUs to Project Aster on 2026-05-16.", "evidence": [{"doc_id": "doc_000194", "sent_id": 6}], "id": "dev_000093", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Finch-3B achieved 0.862 accuracy on VestaLogs for Project Anchor on 2026-04-20.", "evidence": [{"doc_id": "doc_000296", "sent_id": 2}], "id": "dev_000094", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Vela-7B achieved 0.651 macro F1 on LabQA-2 for Project Sonata on 2026-06-09.", "evidence": [{"doc_id": "doc_000079", "sent_id": 7}], "id": "dev_000095", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Leo Hale was assigned as the data steward for Project Anchor on 2026-05-20.", "evidence": [], "id": "dev_000096", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Node Elm-08 allocated 3 GPUs to Project Nereid on 2026-04-26.", "evidence": [], "id": "dev_000097", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from data mixing to contrastive tuning on 2026-06-03.", "evidence": [{"doc_id": "doc_000353", "sent_id": 5}], "id": "dev_000098", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Delta Evidence Study 1 reported on 2026-06-05 that it used alias expansion and did not use a reward model.", "evidence": [{"doc_id": "doc_000154", "sent_id": 8}], "id": "dev_000099", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid's run with Kestrel-7B failed because of an unstable-validation-loss error on 2026-06-12 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000474", "sent_id": 8}], "id": "dev_000100", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nimbus-8B achieved 0.779 evidence F1 on TraceEval-2 for Project Nereid on 2026-04-21.", "evidence": [{"doc_id": "doc_000189", "sent_id": 7}], "id": "dev_000101", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Meridian changed its method from hybrid retrieval to document chunking on 2026-05-25.", "evidence": [], "id": "dev_000102", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor's run with Helix-8B failed because of a checkpoint-mismatch error on 2026-06-12 while using Node Pine-07.", "evidence": [{"doc_id": "doc_000079", "sent_id": 8}], "id": "dev_000103", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Meridian selected Atlas-7B for claim classification on 2026-04-12.", "evidence": [{"doc_id": "doc_000189", "sent_id": 4}], "id": "dev_000104", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Petra Adler was assigned as the data steward on 2026-04-28 changed its method from reward reranking to threshold search on 2026-04-19.", "evidence": [{"doc_id": "doc_000402", "sent_id": 7}, {"doc_id": "doc_000289", "sent_id": 4}], "id": "dev_000105", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Rohan Singh was assigned as the retrieval owner on 2026-04-13 selected Quartz-3B for claim classification on 2026-05-08.", "evidence": [{"doc_id": "doc_000234", "sent_id": 2}, {"doc_id": "doc_000346", "sent_id": 8}], "id": "dev_000106", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Kestrel-3B achieved 0.672 macro F1 on OrionBench-3 for Project Nereid on 2026-06-01.", "evidence": [{"doc_id": "doc_000076", "sent_id": 2}], "id": "dev_000107", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Nimbus-8B achieved a higher latency efficiency score than Finch-3B.", "evidence": [{"doc_id": "doc_000228", "sent_id": 2}, {"doc_id": "doc_000387", "sent_id": 7}], "id": "dev_000108", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata selected Mira-7B for calibration on 2026-05-24.", "evidence": [{"doc_id": "doc_000080", "sent_id": 4}], "id": "dev_000109", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Mina Adler was assigned as the retrieval owner on 2026-05-12 selected Finch-8B for error analysis on 2026-04-16.", "evidence": [{"doc_id": "doc_000151", "sent_id": 7}, {"doc_id": "doc_000480", "sent_id": 3}], "id": "dev_000110", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Birch-04 allocated 1 GPU to Project Nereid on 2026-05-30.", "evidence": [{"doc_id": "doc_000080", "sent_id": 6}], "id": "dev_000111", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian had a failed run with Mira-3B on Node Hazel-14 because of a checkpoint-mismatch error on 2026-04-30.", "evidence": [{"doc_id": "doc_000147", "sent_id": 3}], "id": "dev_000112", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Nereid selected Nimbus-3B for calibration on 2026-04-26.", "evidence": [{"doc_id": "doc_000022", "sent_id": 4}], "id": "dev_000113", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Atlas-3B for evidence retrieval on 2026-04-13.", "evidence": [], "id": "dev_000114", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Elian Shah was assigned as the data steward on 2026-05-05 changed its method from QLoRA adaptation to structured prompting on 2026-05-17.", "evidence": [{"doc_id": "doc_000346", "sent_id": 7}, {"doc_id": "doc_000241", "sent_id": 4}], "id": "dev_000115", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster selected Orchid-3B for reranking on 2026-04-06.", "evidence": [], "id": "dev_000116", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Theo Lind was assigned as the evaluation owner for Project Saffron on 2026-05-20.", "evidence": [], "id": "dev_000117", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Aspen-01 allocated 4 GPUs to Project Meridian on 2026-06-27.", "evidence": [{"doc_id": "doc_000240", "sent_id": 6}], "id": "dev_000118", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Lumen-8B achieved 0.713 accuracy on MemoTrace-3 for Project Anchor on 2026-04-08.", "evidence": [], "id": "dev_000119", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Priya Vale was assigned as the evaluation owner on 2026-06-23 selected River-7B for error analysis on 2026-04-29.", "evidence": [{"doc_id": "doc_000259", "sent_id": 7}, {"doc_id": "doc_000175", "sent_id": 5}], "id": "dev_000120", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared evidence F1 runs, Helix-3B achieved a higher evidence F1 than Finch-3B.", "evidence": [{"doc_id": "doc_000223", "sent_id": 2}, {"doc_id": "doc_000472", "sent_id": 7}], "id": "dev_000121", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-04-20 selected Cedar-3B for error analysis on 2026-05-28.", "evidence": [{"doc_id": "doc_000004", "sent_id": 2}, {"doc_id": "doc_000408", "sent_id": 4}], "id": "dev_000122", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Priya Moss was assigned as the data steward on 2026-06-22 changed its method from LoRA adaptation to hard-negative mining on 2026-05-03.", "evidence": [{"doc_id": "doc_000119", "sent_id": 2}, {"doc_id": "doc_000468", "sent_id": 4}], "id": "dev_000123", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Juniper-06 allocated 4 GPUs to Project Anchor on 2026-04-18.", "evidence": [{"doc_id": "doc_000189", "sent_id": 6}], "id": "dev_000124", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Darian Hale was assigned as the retrieval owner on 2026-04-06 changed its method from LoRA adaptation to hard-negative mining on 2026-04-29.", "evidence": [{"doc_id": "doc_000114", "sent_id": 2}, {"doc_id": "doc_000227", "sent_id": 4}], "id": "dev_000125", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Juniper-06 allocated 1 GPU to Project Aster on 2026-06-06.", "evidence": [{"doc_id": "doc_000353", "sent_id": 6}], "id": "dev_000126", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster changed its method from temporal filtering to alias expansion on 2026-06-09.", "evidence": [], "id": "dev_000127", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "In the compared accuracy runs, Cedar-8B achieved a higher accuracy than Nova-8B.", "evidence": [{"doc_id": "doc_000472", "sent_id": 2}, {"doc_id": "doc_000467", "sent_id": 7}], "id": "dev_000128", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Nova-8B achieved 0.731 macro F1 on LabQA for Project Nereid on 2026-05-11.", "evidence": [{"doc_id": "doc_000035", "sent_id": 2}], "id": "dev_000129", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor selected River-7B for evidence retrieval on 2026-04-05.", "evidence": [{"doc_id": "doc_000171", "sent_id": 4}], "id": "dev_000130", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Atlas-7B for evidence retrieval on 2026-05-31.", "evidence": [{"doc_id": "doc_000079", "sent_id": 4}], "id": "dev_000131", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Camila Brooks was assigned as the data steward on 2026-04-28 selected Nimbus-3B for calibration on 2026-04-24.", "evidence": [{"doc_id": "doc_000234", "sent_id": 7}, {"doc_id": "doc_000437", "sent_id": 8}], "id": "dev_000132", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, Aster-7B achieved a higher accuracy than Vela-7B.", "evidence": [{"doc_id": "doc_000007", "sent_id": 7}, {"doc_id": "doc_000213", "sent_id": 2}], "id": "dev_000133", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Atlas-3B achieved 0.729 latency efficiency score on SignalSet-3 for Project Meridian on 2026-05-05.", "evidence": [{"doc_id": "doc_000022", "sent_id": 7}], "id": "dev_000134", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata moved the Milestone F1 deadline to 2026-07-19 on 2026-06-26.", "evidence": [{"doc_id": "doc_000204", "sent_id": 8}], "id": "dev_000135", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Hazel-14 allocated 1 GPU to Project Nereid on 2026-06-20.", "evidence": [{"doc_id": "doc_000201", "sent_id": 6}], "id": "dev_000136", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Anika Sato was assigned as the lead on 2026-06-01 selected Orchid-3B for reranking on 2026-05-29.", "evidence": [{"doc_id": "doc_000463", "sent_id": 2}, {"doc_id": "doc_000326", "sent_id": 10}], "id": "dev_000137", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Orchid-8B achieved 0.916 accuracy on LumenFacts for Project Anchor on 2026-05-26.", "evidence": [{"doc_id": "doc_000035", "sent_id": 7}], "id": "dev_000138", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from query rewriting to sentence pruning on 2026-06-17.", "evidence": [{"doc_id": "doc_000201", "sent_id": 5}], "id": "dev_000139", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Poplar-12 allocated 3 GPUs to Project Aster on 2026-05-02.", "evidence": [{"doc_id": "doc_000022", "sent_id": 6}], "id": "dev_000140", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid moved the Milestone J1 deadline to 2026-04-27 on 2026-04-02.", "evidence": [{"doc_id": "doc_000209", "sent_id": 3}], "id": "dev_000141", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Nora Sol was assigned as the retrieval owner on 2026-05-12 changed its method from hybrid retrieval to QLoRA adaptation on 2026-05-03.", "evidence": [{"doc_id": "doc_000165", "sent_id": 7}, {"doc_id": "doc_000015", "sent_id": 4}], "id": "dev_000142", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared macro F1 runs, Vela-7B achieved a higher macro F1 than Nimbus-7B.", "evidence": [{"doc_id": "doc_000467", "sent_id": 2}, {"doc_id": "doc_000479", "sent_id": 7}], "id": "dev_000143", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Aspen-01 allocated 3 GPUs to Project Nereid on 2026-05-09.", "evidence": [{"doc_id": "doc_000147", "sent_id": 6}], "id": "dev_000144", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Dr. Theo Grant was assigned as the data steward for Project Saffron on 2026-04-11.", "evidence": [], "id": "dev_000145", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata selected Aster-3B for evidence retrieval on 2026-04-04.", "evidence": [], "id": "dev_000146", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster had a failed run with Orchid-3B on Node Cedar-02 because of an unstable-validation-loss error on 2026-04-09.", "evidence": [{"doc_id": "doc_000189", "sent_id": 3}], "id": "dev_000147", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Willow-05 allocated 6 GPUs to Project Saffron on 2026-05-30.", "evidence": [{"doc_id": "doc_000471", "sent_id": 6}], "id": "dev_000148", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Orchid-3B achieved 0.856 latency efficiency score on LumenFacts for Project Anchor on 2026-05-25.", "evidence": [{"doc_id": "doc_000079", "sent_id": 2}], "id": "dev_000149", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The review of Delta Evidence Study 3 reported on 2026-04-27 that it used BM25 retrieval and did not use a reward model.", "evidence": [{"doc_id": "doc_000075", "sent_id": 2}], "id": "dev_000150", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Nova-7B achieved 0.675 latency efficiency score on LumenFacts-2 for Project Meridian on 2026-04-26.", "evidence": [], "id": "dev_000151", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Aster changed its method from evidence pooling to BM25 retrieval on 2026-04-29.", "evidence": [{"doc_id": "doc_000296", "sent_id": 5}], "id": "dev_000152", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Atlas-3B achieved 0.725 evidence F1 on RiverBench-2 for Project Nereid on 2026-06-15.", "evidence": [{"doc_id": "doc_000120", "sent_id": 2}], "id": "dev_000153", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Maple-01 allocated 3 GPUs to Project Sonata on 2026-04-12.", "evidence": [], "id": "dev_000154", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Poplar-12 allocated 3 GPUs to Project Aster on 2026-04-19.", "evidence": [], "id": "dev_000155", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Selene Kim was assigned as the evaluation owner for Project Anchor on 2026-06-27.", "evidence": [], "id": "dev_000156", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Selene Rios was assigned as the data steward on 2026-06-08 selected Orchid-7B for reranking on 2026-06-26.", "evidence": [{"doc_id": "doc_000225", "sent_id": 2}, {"doc_id": "doc_000003", "sent_id": 8}], "id": "dev_000157", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron moved the Milestone R1 deadline from 2026-06-11 to 2026-06-21 on 2026-05-27.", "evidence": [], "id": "dev_000158", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Aster changed its method from rank fusion to teacher distillation on 2026-06-04.", "evidence": [], "id": "dev_000159", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor selected Helix-3B for claim classification on 2026-06-14.", "evidence": [{"doc_id": "doc_000121", "sent_id": 4}], "id": "dev_000160", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Pine-07 allocated 5 GPUs to Project Sonata on 2026-06-27.", "evidence": [{"doc_id": "doc_000120", "sent_id": 6}], "id": "dev_000161", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Rohan Park was assigned as the lead on 2026-04-14 selected Vela-7B for reranking on 2026-04-30.", "evidence": [{"doc_id": "doc_000327", "sent_id": 7}, {"doc_id": "doc_000151", "sent_id": 3}], "id": "dev_000162", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Mira Nolan was assigned as the evaluation owner on 2026-06-02 selected Kestrel-8B for claim classification on 2026-05-22.", "evidence": [{"doc_id": "doc_000256", "sent_id": 7}, {"doc_id": "doc_000018", "sent_id": 8}], "id": "dev_000163", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Helix-8B achieved 0.527 latency efficiency score on CedarQA-3 for Project Anchor on 2026-05-08.", "evidence": [], "id": "dev_000164", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Maple-01 allocated 2 GPUs to Project Saffron on 2026-04-12.", "evidence": [], "id": "dev_000165", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid's run with Nimbus-8B failed because of an out-of-memory error on 2026-04-24 while using Node Elm-08.", "evidence": [{"doc_id": "doc_000302", "sent_id": 8}], "id": "dev_000166", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Meridian selected Nimbus-8B for calibration on 2026-04-11.", "evidence": [], "id": "dev_000167", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Vera Torres was assigned as the retrieval owner on 2026-06-02 selected Nimbus-8B for reranking on 2026-06-17.", "evidence": [{"doc_id": "doc_000403", "sent_id": 7}, {"doc_id": "doc_000239", "sent_id": 5}], "id": "dev_000168", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared macro F1 runs, Vela-7B achieved a higher macro F1 than Aster-7B.", "evidence": [{"doc_id": "doc_000467", "sent_id": 2}, {"doc_id": "doc_000055", "sent_id": 7}], "id": "dev_000169", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The review of Stable Chains Study 4 reported on 2026-06-10 that it used query rewriting and used a reward model.", "evidence": [{"doc_id": "doc_000172", "sent_id": 4}], "id": "dev_000170", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Anchor moved the Milestone Z1 deadline to 2026-06-29 on 2026-06-11.", "evidence": [{"doc_id": "doc_000204", "sent_id": 3}], "id": "dev_000171", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Evan Iyer was assigned as the retrieval owner for Project Nereid on 2026-06-10.", "evidence": [], "id": "dev_000172", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron moved the Milestone R1 deadline from 2026-04-29 to 2026-05-09 on 2026-04-16.", "evidence": [], "id": "dev_000173", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-04-21 changed its method from document chunking to hybrid retrieval on 2026-05-27.", "evidence": [{"doc_id": "doc_000114", "sent_id": 7}, {"doc_id": "doc_000067", "sent_id": 5}], "id": "dev_000174", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "In the compared accuracy runs, Quartz-7B achieved a higher accuracy than Nova-8B.", "evidence": [{"doc_id": "doc_000363", "sent_id": 2}, {"doc_id": "doc_000467", "sent_id": 7}], "id": "dev_000175", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian changed its method from calibrated voting to evidence pooling on 2026-06-24.", "evidence": [{"doc_id": "doc_000120", "sent_id": 5}], "id": "dev_000176", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor had a failed run with Marble-7B on Node Spruce-03 because of a missing-index error on 2026-05-08.", "evidence": [{"doc_id": "doc_000296", "sent_id": 8}], "id": "dev_000177", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata changed its method from hybrid retrieval to document chunking on 2026-06-16.", "evidence": [], "id": "dev_000178", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Birch-04 allocated 4 GPUs to Project Meridian on 2026-06-13.", "evidence": [], "id": "dev_000179", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Lumen-3B achieved 0.563 accuracy on MemoTrace-3 for Project Anchor on 2026-06-03.", "evidence": [], "id": "dev_000180", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared macro F1 runs, River-3B achieved a higher macro F1 than Quartz-3B.", "evidence": [{"doc_id": "doc_000007", "sent_id": 2}, {"doc_id": "doc_000484", "sent_id": 7}], "id": "dev_000181", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster's run with Lumen-8B failed on Node Spruce-03 because of an unstable-validation-loss error on 2026-06-26.", "evidence": [{"doc_id": "doc_000201", "sent_id": 8}], "id": "dev_000182", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Mara Lane was assigned as the retrieval owner on 2026-04-20 changed its method from alias expansion to temporal filtering on 2026-05-24.", "evidence": [{"doc_id": "doc_000346", "sent_id": 2}, {"doc_id": "doc_000476", "sent_id": 5}], "id": "dev_000183", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Quartz-7B for claim classification on 2026-06-29.", "evidence": [], "id": "dev_000184", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Finch-8B achieved 0.851 latency efficiency score on VestaLogs for Project Anchor on 2026-04-21.", "evidence": [{"doc_id": "doc_000302", "sent_id": 7}], "id": "dev_000185", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Dr. Ravi Reed was assigned as the evaluation owner on 2026-06-09 changed its method from query rewriting to teacher distillation on 2026-04-12.", "evidence": [{"doc_id": "doc_000384", "sent_id": 9}, {"doc_id": "doc_000092", "sent_id": 4}], "id": "dev_000186", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Darian Grant was assigned as the lead on 2026-04-07 changed its method from alias expansion to LoRA adaptation on 2026-06-07.", "evidence": [{"doc_id": "doc_000119", "sent_id": 9}, {"doc_id": "doc_000037", "sent_id": 4}], "id": "dev_000187", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata selected Marble-7B for error analysis on 2026-06-08.", "evidence": [], "id": "dev_000188", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Mira-8B achieved 0.594 accuracy on MemoTrace-2 for Project Saffron on 2026-04-06.", "evidence": [{"doc_id": "doc_000189", "sent_id": 2}], "id": "dev_000189", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Kira Iyer was assigned as the evaluation owner on 2026-06-16 changed its method from metric smoothing to document chunking on 2026-04-19.", "evidence": [{"doc_id": "doc_000461", "sent_id": 7}, {"doc_id": "doc_000244", "sent_id": 4}], "id": "dev_000190", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Cedar-02 allocated 3 GPUs to Project Aster on 2026-04-11.", "evidence": [{"doc_id": "doc_000335", "sent_id": 5}], "id": "dev_000191", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Sofia Frost was assigned as the lead on 2026-06-23 selected Mira-8B for calibration on 2026-06-25.", "evidence": [{"doc_id": "doc_000225", "sent_id": 7}, {"doc_id": "doc_000099", "sent_id": 3}], "id": "dev_000192", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Lumen-8B achieved 0.848 macro F1 on TraceEval for Project Aster on 2026-06-23.", "evidence": [{"doc_id": "doc_000121", "sent_id": 7}], "id": "dev_000193", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Finch-7B achieved 0.907 macro F1 on NereidNotes-2 for Project Aster on 2026-06-02.", "evidence": [{"doc_id": "doc_000471", "sent_id": 7}], "id": "dev_000194", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Nimbus-8B achieved a higher evidence F1 than Finch-3B.", "evidence": [{"doc_id": "doc_000303", "sent_id": 2}, {"doc_id": "doc_000472", "sent_id": 7}], "id": "dev_000195", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid's run with Atlas-3B failed because of an out-of-memory error on 2026-06-18 while using Node Hazel-14.", "evidence": [{"doc_id": "doc_000240", "sent_id": 3}], "id": "dev_000196", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Mira Nolan was assigned as the evaluation owner for Project Meridian on 2026-04-08.", "evidence": [], "id": "dev_000197", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from query rewriting to teacher distillation on 2026-05-06.", "evidence": [{"doc_id": "doc_000147", "sent_id": 5}], "id": "dev_000198", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from structured prompting to QLoRA adaptation on 2026-04-29.", "evidence": [{"doc_id": "doc_000022", "sent_id": 5}], "id": "dev_000199", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Rowan-09 allocated 2 GPUs to Project Anchor on 2026-06-28.", "evidence": [], "id": "dev_000200", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Dr. Theo Grant was assigned as the data steward on 2026-04-21 selected Vela-3B for reranking on 2026-06-11.", "evidence": [{"doc_id": "doc_000271", "sent_id": 7}, {"doc_id": "doc_000237", "sent_id": 3}], "id": "dev_000201", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "The project where Petra Gray was assigned as the evaluation owner on 2026-05-11 selected Vela-8B for reranking on 2026-06-18.", "evidence": [{"doc_id": "doc_000241", "sent_id": 2}, {"doc_id": "doc_000309", "sent_id": 4}], "id": "dev_000202", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron had a failed run with Mira-3B on Node Fir-10 because of a missing-index error on 2026-04-24.", "evidence": [{"doc_id": "doc_000189", "sent_id": 8}], "id": "dev_000203", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "The project where Samir Kwan was assigned as the retrieval owner on 2026-05-19 selected Aster-3B for evidence retrieval on 2026-05-28.", "evidence": [{"doc_id": "doc_000316", "sent_id": 8}, {"doc_id": "doc_000116", "sent_id": 3}], "id": "dev_000204", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Quartz-8B achieved 0.592 macro F1 on OrionBench for Project Sonata on 2026-06-30.", "evidence": [{"doc_id": "doc_000240", "sent_id": 7}], "id": "dev_000205", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected Atlas-7B for calibration on 2026-06-21.", "evidence": [{"doc_id": "doc_000120", "sent_id": 4}], "id": "dev_000206", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Helix-7B achieved 0.827 accuracy on CedarQA-3 for Project Anchor on 2026-06-23.", "evidence": [], "id": "dev_000207", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Node Sycamore-13 allocated 4 GPUs to Project Saffron on 2026-04-11.", "evidence": [{"doc_id": "doc_000171", "sent_id": 6}], "id": "dev_000208", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata had a failed run with Mira-3B on Node Sycamore-13 because of an out-of-memory error on 2026-06-12.", "evidence": [{"doc_id": "doc_000140", "sent_id": 8}], "id": "dev_000209", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Dr. Arun Bauer was assigned as the evaluation owner on 2026-05-11 changed its method from alias expansion to temporal filtering on 2026-05-03.", "evidence": [{"doc_id": "doc_000138", "sent_id": 2}, {"doc_id": "doc_000372", "sent_id": 4}], "id": "dev_000210", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Anchor changed its method from confidence calibration to chain verification on 2026-06-15.", "evidence": [], "id": "dev_000211", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Node Willow-05 allocated 3 GPUs to Project Sonata on 2026-05-20.", "evidence": [], "id": "dev_000212", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Saffron selected Marble-3B for error analysis on 2026-06-21.", "evidence": [{"doc_id": "doc_000240", "sent_id": 4}], "id": "dev_000213", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid had a failed run with Vela-8B on Node Elm-08 because of an unstable-validation-loss error on 2026-04-23.", "evidence": [{"doc_id": "doc_000296", "sent_id": 3}], "id": "dev_000214", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron's run with Quartz-8B failed because of a checkpoint-mismatch error on 2026-05-14 while using Node Aspen-01.", "evidence": [{"doc_id": "doc_000035", "sent_id": 3}], "id": "dev_000215", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Quartz-3B achieved 0.532 evidence F1 on OrionBench for Project Sonata on 2026-06-29.", "evidence": [{"doc_id": "doc_000335", "sent_id": 1}], "id": "dev_000216", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Mira-8B achieved 0.598 evidence F1 on TraceEval-3 for Project Sonata on 2026-05-26.", "evidence": [{"doc_id": "doc_000009", "sent_id": 7}], "id": "dev_000217", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster changed its method from teacher distillation to rank fusion on 2026-06-03.", "evidence": [{"doc_id": "doc_000079", "sent_id": 5}], "id": "dev_000218", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Hazel-14 allocated 3 GPUs to Project Nereid on 2026-05-19.", "evidence": [], "id": "dev_000219", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Owen Marin was assigned as the data steward on 2026-06-15 selected Mira-7B for calibration on 2026-05-22.", "evidence": [{"doc_id": "doc_000288", "sent_id": 2}, {"doc_id": "doc_000162", "sent_id": 8}], "id": "dev_000220", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Talia Marin was assigned as the lead on 2026-06-08 changed its method from hybrid retrieval to QLoRA adaptation on 2026-04-26.", "evidence": [{"doc_id": "doc_000259", "sent_id": 2}, {"doc_id": "doc_000004", "sent_id": 4}], "id": "dev_000221", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian selected Atlas-3B for evidence retrieval on 2026-06-22.", "evidence": [], "id": "dev_000222", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Nadia Singh was assigned as the data steward on 2026-06-29 changed its method from metric smoothing to document chunking on 2026-05-31.", "evidence": [{"doc_id": "doc_000292", "sent_id": 2}, {"doc_id": "doc_000262", "sent_id": 4}], "id": "dev_000223", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian had a failed run with Orchid-7B on Node Elm-08 because of a checkpoint-mismatch error on 2026-06-05.", "evidence": [{"doc_id": "doc_000471", "sent_id": 8}], "id": "dev_000224", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster moved the Milestone Z1 deadline to 2026-07-09 on 2026-06-18.", "evidence": [], "id": "dev_000225", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Cedar-8B achieved 0.782 accuracy on VestaLogs-2 for Project Meridian on 2026-05-19.", "evidence": [{"doc_id": "doc_000194", "sent_id": 7}], "id": "dev_000226", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Marble-7B achieved 0.601 latency efficiency score on VestaLogs-3 for Project Saffron on 2026-05-03.", "evidence": [], "id": "dev_000227", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid's run with Kestrel-3B failed on Node Birch-04 because of a missing-index error on 2026-05-29.", "evidence": [{"doc_id": "doc_000035", "sent_id": 8}], "id": "dev_000228", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster moved the Milestone N1 deadline to 2026-06-09 on 2026-05-17.", "evidence": [], "id": "dev_000229", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Nimbus-7B achieved 0.753 accuracy on MemoTrace for Project Meridian on 2026-06-30.", "evidence": [], "id": "dev_000230", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Nereid's run with Nova-8B failed because of an unstable-validation-loss error on 2026-05-14 while using Node Aspen-01.", "evidence": [{"doc_id": "doc_000009", "sent_id": 3}], "id": "dev_000231", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Sofia Frost was assigned as the lead for Project Sonata on 2026-06-10.", "evidence": [], "id": "dev_000232", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Samir Ames was assigned as the lead on 2026-05-18 selected Marble-8B for error analysis on 2026-06-05.", "evidence": [{"doc_id": "doc_000256", "sent_id": 2}, {"doc_id": "doc_000323", "sent_id": 8}], "id": "dev_000233", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Vela-7B achieved 0.595 accuracy on LumenFacts-3 for Project Saffron on 2026-05-12.", "evidence": [], "id": "dev_000234", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Sonata had a failed run with Aster-8B on Node Willow-05 because of an out-of-memory error on 2026-04-17.", "evidence": [{"doc_id": "doc_000335", "sent_id": 7}], "id": "dev_000235", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron changed its method from metric smoothing to document chunking on 2026-05-13.", "evidence": [{"doc_id": "doc_000194", "sent_id": 5}], "id": "dev_000236", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "The project where Nora Bauer was assigned as the lead on 2026-05-25 selected Atlas-8B for evidence retrieval on 2026-05-01.", "evidence": [{"doc_id": "doc_000388", "sent_id": 2}, {"doc_id": "doc_000073", "sent_id": 8}], "id": "dev_000237", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "The project where Vera Kim was assigned as the lead on 2026-06-01 changed its method from calibrated voting to evidence pooling on 2026-06-21.", "evidence": [{"doc_id": "doc_000461", "sent_id": 2}, {"doc_id": "doc_000306", "sent_id": 5}], "id": "dev_000238", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Willow-05 allocated 3 GPUs to Project Sonata on 2026-04-18.", "evidence": [{"doc_id": "doc_000302", "sent_id": 6}], "id": "dev_000239", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Saffron selected Mira-8B for calibration on 2026-06-23.", "evidence": [], "id": "dev_000240", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Project Saffron moved the Milestone X1 deadline from 2026-04-24 to 2026-04-30 on 2026-04-01.", "evidence": [], "id": "dev_000241", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Sonata selected Atlas-7B for error analysis on 2026-05-03.", "evidence": [{"doc_id": "doc_000147", "sent_id": 4}], "id": "dev_000242", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The project where Felix Brooks was assigned as the retrieval owner on 2026-04-13 changed its method from data mixing to reward reranking on 2026-06-21.", "evidence": [{"doc_id": "doc_000402", "sent_id": 2}, {"doc_id": "doc_000419", "sent_id": 5}], "id": "dev_000243", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Quartz-8B achieved 0.637 accuracy on CedarQA-2 for Project Saffron on 2026-05-27.", "evidence": [], "id": "dev_000244", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata had a failed run with Vela-7B on Node Fir-10 because of an unstable-validation-loss error on 2026-06-12.", "evidence": [{"doc_id": "doc_000353", "sent_id": 8}], "id": "dev_000245", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Nereid had a failed run with Orchid-3B on Node Laurel-11 because of an out-of-memory error on 2026-04-03.", "evidence": [{"doc_id": "doc_000120", "sent_id": 8}], "id": "dev_000246", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "Project Anchor moved the Milestone T1 deadline from 2026-04-19 to 2026-04-25 on 2026-04-08.", "evidence": [], "id": "dev_000247", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Node Sycamore-13 allocated 2 GPUs to Project Saffron on 2026-06-21.", "evidence": [], "id": "dev_000248", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Nereid changed its method from query rewriting to sentence pruning on 2026-05-26.", "evidence": [], "id": "dev_000249", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Anchor's run with Orchid-3B failed on Node Cedar-02 because of a checkpoint-mismatch error on 2026-05-28.", "evidence": [{"doc_id": "doc_000353", "sent_id": 3}], "id": "dev_000250", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "In the compared latency efficiency score runs, Quartz-3B achieved a higher latency efficiency score than River-3B.", "evidence": [{"doc_id": "doc_000223", "sent_id": 7}, {"doc_id": "doc_000055", "sent_id": 2}], "id": "dev_000251", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "The project where Iris Stone was assigned as the data steward on 2026-05-05 selected River-3B for evidence retrieval on 2026-05-21.", "evidence": [{"doc_id": "doc_000004", "sent_id": 9}, {"doc_id": "doc_000256", "sent_id": 3}], "id": "dev_000252", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Sonata had a failed run with Vela-8B on Node Fir-10 because of an out-of-memory error on 2026-06-11.", "evidence": [{"doc_id": "doc_000121", "sent_id": 3}], "id": "dev_000253", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid selected Cedar-8B for error analysis on 2026-04-05.", "evidence": [{"doc_id": "doc_000335", "sent_id": 3}], "id": "dev_000254", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "The review of Vector Lantern Study 4 reported on 2026-04-27 that it used confidence calibration and used a reward model.", "evidence": [{"doc_id": "doc_000358", "sent_id": 2}], "id": "dev_000255", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Marble-8B for error analysis on 2026-05-04.", "evidence": [], "id": "dev_000256", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian's run with Kestrel-7B failed because of a checkpoint-mismatch error on 2026-04-23 while using Node Birch-04.", "evidence": [{"doc_id": "doc_000423", "sent_id": 3}], "id": "dev_000257", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Mina Torres was assigned as the data steward for Project Meridian on 2026-06-23.", "evidence": [{"doc_id": "doc_000204", "sent_id": 7}], "id": "dev_000258", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "River-8B achieved 0.869 accuracy on SignalSet-2 for Project Anchor on 2026-04-08.", "evidence": [], "id": "dev_000259", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Node Laurel-11 allocated 2 GPUs to Project Meridian on 2026-05-23.", "evidence": [{"doc_id": "doc_000009", "sent_id": 6}], "id": "dev_000260", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Birch-04 allocated 3 GPUs to Project Nereid on 2026-04-08.", "evidence": [], "id": "dev_000261", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Kestrel-7B achieved 0.717 latency efficiency score on CedarQA for Project Meridian on 2026-04-14.", "evidence": [{"doc_id": "doc_000335", "sent_id": 6}], "id": "dev_000262", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Helix-7B achieved 0.853 macro F1 on OrionBench-2 for Project Aster on 2026-04-27.", "evidence": [{"doc_id": "doc_000147", "sent_id": 2}], "id": "dev_000263", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Dr. Julian Gray was assigned as the lead for Project Nereid on 2026-04-29.", "evidence": [], "id": "dev_000264", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Selene Kim was assigned as the evaluation owner on 2026-06-09 selected Lumen-8B for calibration on 2026-06-18.", "evidence": [{"doc_id": "doc_000388", "sent_id": 7}, {"doc_id": "doc_000306", "sent_id": 4}], "id": "dev_000265", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "River-3B achieved 0.910 latency efficiency score on SignalSet-2 for Project Anchor on 2026-06-30.", "evidence": [{"doc_id": "doc_000120", "sent_id": 7}], "id": "dev_000266", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Nimbus-3B achieved 0.723 accuracy on MemoTrace for Project Meridian on 2026-06-09.", "evidence": [{"doc_id": "doc_000353", "sent_id": 7}], "id": "dev_000267", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Aster selected Atlas-7B for reranking on 2026-04-08.", "evidence": [{"doc_id": "doc_000209", "sent_id": 5}], "id": "dev_000268", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared accuracy runs, Finch-7B achieved a higher accuracy than Aster-7B.", "evidence": [{"doc_id": "doc_000041", "sent_id": 2}, {"doc_id": "doc_000007", "sent_id": 7}], "id": "dev_000269", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Elm-08 allocated 6 GPUs to Project Meridian on 2026-06-06.", "evidence": [{"doc_id": "doc_000079", "sent_id": 6}], "id": "dev_000270", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Marble-7B achieved 0.601 accuracy on VestaLogs-3 for Project Saffron on 2026-06-23.", "evidence": [{"doc_id": "doc_000201", "sent_id": 7}], "id": "dev_000271", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Meridian selected Atlas-8B for evidence retrieval on 2026-05-03.", "evidence": [{"doc_id": "doc_000385", "sent_id": 4}], "id": "dev_000272", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "conflict_resolution"}}
{"claim": "The project where Dr. Jonas Nolan was assigned as the retrieval owner on 2026-05-26 changed its method from metric smoothing to document chunking on 2026-05-10.", "evidence": [{"doc_id": "doc_000138", "sent_id": 7}, {"doc_id": "doc_000426", "sent_id": 4}], "id": "dev_000273", "label": "REFUTED", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Saffron changed its method from document chunking to hybrid retrieval on 2026-04-05.", "evidence": [{"doc_id": "doc_000209", "sent_id": 4}], "id": "dev_000274", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The project where Kira Frost was assigned as the data steward on 2026-06-15 changed its method from QLoRA adaptation to structured prompting on 2026-06-10.", "evidence": [{"doc_id": "doc_000193", "sent_id": 2}, {"doc_id": "doc_000007", "sent_id": 5}], "id": "dev_000275", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Project Nereid's run with Cedar-8B failed because of an out-of-memory error on 2026-04-23 while using Node Laurel-11.", "evidence": [{"doc_id": "doc_000397", "sent_id": 3}], "id": "dev_000276", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid changed its method from contrastive tuning to data mixing on 2026-06-14.", "evidence": [{"doc_id": "doc_000204", "sent_id": 4}], "id": "dev_000277", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Talia Marin was assigned as the lead for Project Aster on 2026-06-17.", "evidence": [], "id": "dev_000278", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Node Spruce-03 allocated 5 GPUs to Project Aster on 2026-06-20.", "evidence": [{"doc_id": "doc_000204", "sent_id": 6}], "id": "dev_000279", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Node Spruce-03 allocated 2 GPUs to Project Anchor on 2026-05-09.", "evidence": [{"doc_id": "doc_000385", "sent_id": 6}], "id": "dev_000280", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "alias_resolution"}}
{"claim": "Nova-3B achieved 0.795 accuracy on LumenFacts-2 for Project Meridian on 2026-04-15.", "evidence": [], "id": "dev_000281", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "multi_doc"}}
{"claim": "Lumen-7B achieved 0.803 accuracy on MemoTrace-3 for Project Anchor on 2026-05-11.", "evidence": [{"doc_id": "doc_000009", "sent_id": 2}], "id": "dev_000282", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor's run with Orchid-8B failed on Node Cedar-02 because of a missing-index error on 2026-05-29.", "evidence": [{"doc_id": "doc_000009", "sent_id": 8}], "id": "dev_000283", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Aster's run with Finch-3B failed on Node Juniper-06 because of an out-of-memory error on 2026-06-12.", "evidence": [{"doc_id": "doc_000493", "sent_id": 8}], "id": "dev_000284", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "The review of Pale Compass Study 2 reported on 2026-05-01 that it used threshold search and did not use a reward model.", "evidence": [{"doc_id": "doc_000371", "sent_id": 8}], "id": "dev_000285", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Nova-8B achieved a higher evidence F1 than Cedar-8B.", "evidence": [{"doc_id": "doc_000213", "sent_id": 7}, {"doc_id": "doc_000387", "sent_id": 2}], "id": "dev_000286", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Hazel-14 allocated 4 GPUs to Project Meridian on 2026-05-02.", "evidence": [{"doc_id": "doc_000296", "sent_id": 6}], "id": "dev_000287", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "In the compared latency efficiency score runs, Kestrel-8B achieved a higher latency efficiency score than Nimbus-8B.", "evidence": [{"doc_id": "doc_000264", "sent_id": 7}, {"doc_id": "doc_000228", "sent_id": 2}], "id": "dev_000288", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Node Sycamore-13 allocated 3 GPUs to Project Sonata on 2026-05-23.", "evidence": [{"doc_id": "doc_000035", "sent_id": 6}], "id": "dev_000289", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor changed its method from temporal filtering to alias expansion on 2026-05-08.", "evidence": [], "id": "dev_000290", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
{"claim": "Project Anchor selected Atlas-7B for reranking on 2026-05-24.", "evidence": [{"doc_id": "doc_000471", "sent_id": 4}], "id": "dev_000291", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Sonata had a failed run with Marble-3B on Node Maple-01 because of an out-of-memory error on 2026-05-08.", "evidence": [{"doc_id": "doc_000022", "sent_id": 8}], "id": "dev_000292", "label": "SUPPORTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "In the compared evidence F1 runs, Nimbus-8B achieved a higher evidence F1 than Kestrel-8B.", "evidence": [{"doc_id": "doc_000303", "sent_id": 2}, {"doc_id": "doc_000363", "sent_id": 7}], "id": "dev_000293", "label": "REFUTED", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Meridian changed its method from query rewriting to sentence pruning on 2026-05-29.", "evidence": [], "id": "dev_000294", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "hard", "reasoning_type": "multi_doc"}}
{"claim": "Project Meridian's run with Nimbus-7B failed because of a checkpoint-mismatch error on 2026-06-11 while using Node Elm-08.", "evidence": [{"doc_id": "doc_000201", "sent_id": 3}], "id": "dev_000295", "label": "SUPPORTED", "metadata": {"difficulty": "medium", "reasoning_type": "single_doc"}}
{"claim": "Project Nereid changed its method from sentence pruning to query rewriting on 2026-05-27.", "evidence": [{"doc_id": "doc_000471", "sent_id": 5}], "id": "dev_000296", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Saffron selected Atlas-7B for reranking on 2026-04-26.", "evidence": [{"doc_id": "doc_000296", "sent_id": 4}], "id": "dev_000297", "label": "REFUTED", "metadata": {"difficulty": "easy", "reasoning_type": "single_doc"}}
{"claim": "Project Anchor moved the Milestone B1 deadline from 2026-05-17 to 2026-05-21 on 2026-05-02.", "evidence": [], "id": "dev_000298", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "numeric_or_comparison"}}
{"claim": "Project Aster selected Orchid-8B for reranking on 2026-04-12.", "evidence": [{"doc_id": "doc_000302", "sent_id": 4}], "id": "dev_000299", "label": "SUPPORTED", "metadata": {"difficulty": "hard", "reasoning_type": "conflict_resolution"}}
{"claim": "Mara Quinn was assigned as the lead for Project Sonata on 2026-05-06.", "evidence": [], "id": "dev_000300", "label": "NOT_ENOUGH_INFO", "metadata": {"difficulty": "medium", "reasoning_type": "temporal"}}
