["index", "text_input", "text_output_gt", "text_gen", "image_path"] [1, [["", "is it a guy?"]], "no.", "no", "coco/train2014/COCO_train2014_000000579680.jpg"] [2, [["", "is it a guy?"], ["No.", "is it a racket?"]], "yes.", "yes", "coco/train2014/COCO_train2014_000000579680.jpg"] [3, [["", "is it a guy?"], ["No.", "is it a racket?"], ["Yes.", "is it on the right?"]], "no.", "no", "coco/train2014/COCO_train2014_000000579680.jpg"]