diff --git a/DATASET_PACSUM/Copy_of_Data_Creation_and_Preprocessing.ipynb b/DATASET_PACSUM/Copy_of_Data_Creation_and_Preprocessing.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e6f9655b097523ac6224e15b698116991e4b9423 --- /dev/null +++ b/DATASET_PACSUM/Copy_of_Data_Creation_and_Preprocessing.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"code","execution_count":8,"metadata":{"id":"_7Dv06u3wCgF","executionInfo":{"status":"ok","timestamp":1719055848840,"user_tz":-240,"elapsed":542,"user":{"displayName":"Aditi Paretkar","userId":"17466297872366651006"}}},"outputs":[],"source":["import xml.etree.ElementTree as ET"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"nljzR6jIRo93"},"outputs":[],"source":["#get all xmls in a dict"]},{"cell_type":"code","execution_count":9,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"wYUxD2tYP4g7","outputId":"e0a1be34-990a-46e1-8cb1-0acc00afb329","executionInfo":{"status":"ok","timestamp":1719055855304,"user_tz":-240,"elapsed":3805,"user":{"displayName":"Aditi Paretkar","userId":"17466297872366651006"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","execution_count":14,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"W419k_HMz4in","outputId":"099d951d-5bf3-49f4-c6c0-512fdcb41e1c","executionInfo":{"status":"ok","timestamp":1719058193140,"user_tz":-240,"elapsed":183213,"user":{"displayName":"Aditi Paretkar","userId":"17466297872366651006"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["Going through A00-1031.xml index= 0\n","Going through A00-1043.xml index= 0\n","Going through A00-2004.xml index= 0\n","Going through A00-2009.xml index= 0\n","Going through A00-2018.xml index= 0\n","Going through A00-2019.xml 
index= 0\n","Going through A00-2024.xml index= 0\n","Going through A00-2026.xml index= 0\n","Going through A00-2030.xml index= 0\n","Going through A00-2031.xml index= 0\n","Going through A00-2034.xml index= 0\n","Going through A88-1019.xml index= 0\n","Going through A92-1006.xml index= 0\n","Going through A92-1018.xml index= 0\n","Going through A92-1021.xml index= 0\n","Going through A94-1006.xml index= 0\n","Going through A94-1009.xml index= 0\n","Going through A94-1016.xml index= 0\n","Going through A97-1004.xml index= 0\n","Going through A97-1011.xml index= 0\n","Going through A97-1014.xml index= 0\n","Going through A97-1029.xml index= 0\n","Going through A97-1030.xml index= 0\n","Going through A97-1039.xml index= 0\n","Going through A97-1052.xml index= 0\n","Going through C00-1007.xml index= 0\n","Going through C00-1044.xml index= 0\n","Going through C00-1072.xml index= 0\n","Going through C00-2136.xml index= 0\n","Going through C00-2137.xml index= 0\n","Going through C00-2163.xml index= 0\n","Going through C02-1011.xml index= 0\n","Going through C02-1054.xml index= 0\n","Going through C02-1114.xml index= 0\n","Going through C02-1139.xml index= 0\n","Going through C02-1144.xml index= 0\n","Going through C02-1145.xml index= 0\n","Going through C02-1150.xml index= 0\n","Going through C02-2025.xml index= 0\n","Going through C04-1010.xml index= 0\n","Going through C04-1024.xml index= 0\n","Going through C04-1041.xml index= 0\n","Going through C04-1046.xml index= 0\n","Going through C04-1051.xml index= 0\n","Going through C04-1059.xml index= 0\n","Going through C04-1072.xml index= 0\n","Going through C04-1073.xml index= 0\n","Going through C04-1080.xml index= 0\n","Going through C04-1081.xml index= 0\n","Going through C04-1100.xml index= 0\n","Going through C04-1111.xml index= 0\n","Going through C04-1146.xml index= 0\n","Going through C04-1180.xml index= 0\n","Going through C04-1197.xml index= 0\n","Going through C04-1200.xml index= 0\n","Going through C08-1018.xml 
index= 0\n","Going through C08-1022.xml index= 0\n","Going through C08-1098.xml index= 0\n","Going through C08-1107.xml index= 0\n","Going through C08-1109.xml index= 0\n","Going through C08-1114.xml index= 0\n","Going through C10-1011.xml index= 0\n","Going through C10-1152.xml index= 0\n","Going through C10-2005.xml index= 0\n","Going through C10-2028.xml index= 0\n","Going through C86-1016.xml index= 0\n","Going through C86-1045.xml index= 0\n","Going through C88-1016.xml index= 0\n","Going through C88-2121.xml index= 0\n","Going through C88-2128.xml index= 0\n","Going through C88-2147.xml index= 0\n","Going through C90-2067.xml index= 0\n","Going through C90-3030.xml index= 0\n","Going through C90-3044.xml index= 0\n","Going through C90-3045.xml index= 0\n","Going through C90-3052.xml index= 0\n","Going through C90-3063.xml index= 0\n","Going through C92-1019.xml index= 0\n","Going through C92-1025.xml index= 0\n","Going through C92-1038.xml index= 0\n","Going through C92-2066.xml index= 0\n","Going through C92-2070.xml index= 0\n","Going through C92-2082.xml index= 0\n","Going through C92-3126.xml index= 0\n","Going through C92-3150.xml index= 0\n","Going through C94-1027.xml index= 0\n","Going through C94-1032.xml index= 0\n","Going through C94-1042.xml index= 0\n","Going through C94-1079.xml index= 0\n","Going through C94-2174.xml index= 0\n","Going through C94-2178.xml index= 0\n","Going through C94-2195.xml index= 0\n","Going through C96-1005.xml index= 0\n","Going through C96-1021.xml index= 0\n","Going through C96-1055.xml index= 0\n","Going through C96-1058.xml index= 0\n","Going through C96-1079.xml index= 0\n","Going through C96-2141.xml index= 0\n","Going through C96-2183.xml index= 0\n","Going through D07-1002.xml index= 0\n","Going through D07-1003.xml index= 0\n","Going through D07-1007.xml index= 0\n","Going through D07-1031.xml index= 0\n","Going through D07-1043.xml index= 0\n","Going through D07-1061.xml index= 0\n","Going through D07-1071.xml 
index= 0\n","Going through D07-1072.xml index= 0\n","Going through D07-1074.xml index= 0\n","Going through D07-1076.xml index= 0\n","Going through D07-1077.xml index= 0\n","Going through D07-1080.xml index= 0\n","Going through D07-1090.xml index= 0\n","Going through D07-1091.xml index= 0\n","Going through D07-1096.xml index= 0\n","Going through D07-1097.xml index= 0\n","Going through D07-1101.xml index= 0\n","Going through D07-1103.xml index= 0\n","Going through D07-1104.xml index= 0\n","Going through D07-1109.xml index= 0\n","Going through D07-1111.xml index= 0\n","Going through D07-1114.xml index= 0\n","Going through D08-1011.xml index= 0\n","Going through D08-1014.xml index= 0\n","Going through D08-1016.xml index= 0\n","Going through D08-1020.xml index= 0\n","Going through D08-1021.xml index= 0\n","Going through D08-1022.xml index= 0\n","Going through D08-1024.xml index= 0\n","Going through D08-1027.xml index= 0\n","Going through D08-1031.xml index= 0\n","Going through D08-1035.xml index= 0\n","Going through D08-1036.xml index= 0\n","Going through D08-1059.xml index= 0\n","Going through D08-1065.xml index= 0\n","Going through D08-1068.xml index= 0\n","Going through D08-1076.xml index= 0\n","Going through D08-1082.xml index= 0\n","Going through D08-1083.xml index= 0\n","Going through D08-1089.xml index= 0\n","Going through D08-1092.xml index= 0\n","Going through D09-1001.xml index= 0\n","Going through D09-1005.xml index= 0\n","Going through D09-1026.xml index= 0\n","Going through D09-1030.xml index= 0\n","Going through D09-1058.xml index= 0\n","Going through D09-1086.xml index= 0\n","Going through D09-1092.xml index= 0\n","Going through D09-1098.xml index= 0\n","Going through D09-1101.xml index= 0\n","Going through D09-1120.xml index= 0\n","Going through D09-1127.xml index= 0\n","Going through D09-1159.xml index= 0\n","Going through D10-1001.xml index= 0\n","Going through D10-1044.xml index= 0\n","Going through D10-1048.xml index= 0\n","Going through D10-1115.xml 
index= 0\n","Going through D10-1119.xml index= 0\n","Going through D10-1120.xml index= 0\n","Going through D10-1124.xml index= 0\n","Going through D10-1125.xml index= 0\n","Going through D11-1006.xml index= 0\n","Going through D11-1014.xml index= 0\n","Going through D11-1033.xml index= 0\n","Going through D11-1062.xml index= 0\n","Going through D11-1125.xml index= 0\n","Going through D11-1129.xml index= 0\n","Going through D11-1141.xml index= 0\n","Going through D11-1142.xml index= 0\n","Going through D12-1050.xml index= 0\n","Going through D12-1133.xml index= 0\n","Going through E03-1005.xml index= 0\n","Going through E03-1008.xml index= 0\n","Going through E03-1009.xml index= 0\n","Going through E03-1071.xml index= 0\n","Going through E03-1076.xml index= 0\n","Going through E06-1002.xml index= 0\n","Going through E06-1005.xml index= 0\n","Going through E06-1011.xml index= 0\n","Going through E06-1015.xml index= 0\n","Going through E06-1025.xml index= 0\n","Going through E06-1027.xml index= 0\n","Going through E06-1031.xml index= 0\n","Going through E06-1032.xml index= 0\n","Going through E06-1038.xml index= 0\n","Going through E06-1040.xml index= 0\n","Going through E06-1042.xml index= 0\n","Going through E06-1043.xml index= 0\n","Going through E06-1051.xml index= 0\n","Going through E09-1005.xml index= 0\n","Going through E09-1013.xml index= 0\n","Going through E87-1002.xml index= 0\n","Going through E89-1009.xml index= 0\n","Going through E89-1037.xml index= 0\n","Going through E99-1001.xml index= 0\n","Going through E99-1010.xml index= 0\n","Going through E99-1023.xml index= 0\n","Going through H01-1035.xml index= 0\n","Going through H05-1004.xml index= 0\n","Going through H05-1010.xml index= 0\n","Going through H05-1011.xml index= 0\n","Going through H05-1012.xml index= 0\n","Going through H05-1021.xml index= 0\n","Going through H05-1043.xml index= 0\n","Going through H05-1044.xml index= 0\n","Going through H05-1045.xml index= 0\n","Going through H05-1053.xml 
index= 0\n","Going through H05-1059.xml index= 0\n","Going through H05-1066.xml index= 0\n","Going through H05-1073.xml index= 0\n","Going through H05-1079.xml index= 0\n","Going through H05-1091.xml index= 0\n","Going through H05-2018.xml index= 0\n","Going through H91-1026.xml index= 0\n","Going through H91-1060.xml index= 0\n","Going through H92-1026.xml index= 0\n","Going through H92-1045.xml index= 0\n","Going through H93-1051.xml index= 0\n","Going through H93-1052.xml index= 0\n","Going through H93-1061.xml index= 0\n","Going through H94-1020.xml index= 0\n","Going through H94-1046.xml index= 0\n","Going through H94-1048.xml index= 0\n","Going through I05-2038.xml index= 0\n","Going through I05-3017.xml index= 0\n","Going through I05-3025.xml index= 0\n","Going through I05-3027.xml index= 0\n","Going through I08-1059.xml index= 0\n","Going through J00-1004.xml index= 0\n","Going through J00-2004.xml index= 0\n","Going through J00-3003.xml index= 0\n","Going through J00-3004.xml index= 0\n","Going through J00-4003.xml index= 0\n","Going through J00-4005.xml index= 0\n","Going through J01-2001.xml index= 0\n","Going through J01-2002.xml index= 0\n","Going through J01-2004.xml index= 0\n","Going through J01-3001.xml index= 0\n","Going through J01-3003.xml index= 0\n","Going through J01-4004.xml index= 0\n","Going through J02-1002.xml index= 0\n","Going through J02-1003.xml index= 0\n","Going through J02-2003.xml index= 0\n","Going through J02-3001.xml index= 0\n","Going through J02-4002.xml index= 0\n","Going through J03-1002.xml index= 0\n","Going through J03-1003.xml index= 0\n","Going through J03-1005.xml index= 0\n","Going through J03-3001.xml index= 0\n","Going through J03-3002.xml index= 0\n","Going through J03-3005.xml index= 0\n","Going through J03-4003.xml index= 0\n","Going through J03-4004.xml index= 0\n","Going through J04-1002.xml index= 0\n","Going through J04-1005.xml index= 0\n","Going through J04-2003.xml index= 0\n","Going through J04-3002.xml 
index= 0\n","Going through J04-4002.xml index= 0\n","Going through J04-4004.xml index= 0\n","Going through J05-1003.xml index= 0\n","Going through J05-1004.xml index= 0\n","Going through J05-3002.xml index= 0\n","Going through J05-4003.xml index= 0\n","Going through J06-1003.xml index= 0\n","Going through J06-3003.xml index= 0\n","Going through J07-2003.xml index= 0\n","Going through J07-3004.xml index= 0\n","Going through J07-4004.xml index= 0\n","Going through J08-1001.xml index= 0\n","Going through J08-1002.xml index= 0\n","Going through J08-2002.xml index= 0\n","Going through J08-2005.xml index= 0\n","Going through J08-4003.xml index= 0\n","Going through J08-4004.xml index= 0\n","Going through J09-3003.xml index= 0\n","Going through J10-3003.xml index= 0\n","Going through J10-4006.xml index= 0\n","Going through J80-3003.xml index= 0\n","Going through J81-4003.xml index= 0\n","Going through J82-3004.xml index= 0\n","Going through J86-3001.xml index= 0\n","Going through J87-1004.xml index= 0\n","Going through J87-1005.xml index= 0\n","Going through J88-1003.xml index= 0\n","Going through J88-2003.xml index= 0\n","Going through J88-2006.xml index= 0\n","Going through J90-1003.xml index= 0\n","Going through J90-1004.xml index= 0\n","Going through J90-2002.xml index= 0\n","Going through J91-1002.xml index= 0\n","Going through J91-1003.xml index= 0\n","Going through J91-4003.xml index= 0\n","Going through J92-1001.xml index= 0\n","Going through J92-1004.xml index= 0\n","Going through J92-4003.xml index= 0\n","Going through J92-4007.xml index= 0\n","Going through J93-1001.xml index= 0\n","Going through J93-1002.xml index= 0\n","Going through J93-1003.xml index= 0\n","Going through J93-1004.xml index= 0\n","Going through J93-1005.xml index= 0\n","Going through J93-1006.xml index= 0\n","Going through J93-1007.xml index= 0\n","Going through J93-2002.xml index= 0\n","Going through J93-2003.xml index= 0\n","Going through J93-2004.xml index= 0\n","Going through J93-2005.xml 
index= 0\n","Going through J93-2006.xml index= 0\n","Going through J93-3003.xml index= 0\n","Going through J94-2001.xml index= 0\n","Going through J94-2003.xml index= 0\n","Going through J94-3001.xml index= 0\n","Going through J94-4001.xml index= 0\n","Going through J94-4002.xml index= 0\n","Going through J94-4003.xml index= 0\n","Going through J94-4004.xml index= 0\n","Going through J95-2002.xml index= 0\n","Going through J95-2003.xml index= 0\n","Going through J95-4004.xml index= 0\n","Going through J96-1001.xml index= 0\n","Going through J96-1002.xml index= 0\n","Going through J96-2004.xml index= 0\n","Going through J96-3004.xml index= 0\n","Going through J97-1002.xml index= 0\n","Going through J97-1003.xml index= 0\n","Going through J97-1005.xml index= 0\n","Going through J97-2003.xml index= 0\n","Going through J97-3002.xml index= 0\n","Going through J97-3003.xml index= 0\n","Going through J97-4005.xml index= 0\n","Going through J98-1001.xml index= 0\n","Going through J98-1006.xml index= 0\n","Going through J98-2001.xml index= 0\n","Going through J98-2002.xml index= 0\n","Going through J98-2004.xml index= 0\n","Going through J98-3005.xml index= 0\n","Going through J98-4003.xml index= 0\n","Going through J98-4004.xml index= 0\n","Going through J99-1003.xml index= 0\n","Going through J99-2004.xml index= 0\n","Going through J99-3001.xml index= 0\n","Going through J99-4004.xml index= 0\n","Going through J99-4005.xml index= 0\n","Going through L08-1093.xml index= 0\n","Going through M95-1005.xml index= 0\n","Going through M95-1012.xml index= 0\n","Going through N01-1006.xml index= 0\n","Going through N01-1008.xml index= 0\n","Going through N01-1011.xml index= 0\n","Going through N01-1016.xml index= 0\n","Going through N01-1020.xml index= 0\n","Going through N01-1021.xml index= 0\n","Going through N01-1023.xml index= 0\n","Going through N01-1024.xml index= 0\n","Going through N01-1025.xml index= 0\n","Going through N01-1026.xml index= 0\n","Going through N03-1003.xml 
index= 0\n","Going through N03-1014.xml index= 0\n","Going through N03-1016.xml index= 0\n","Going through N03-1017.xml index= 0\n","Going through N03-1020.xml index= 0\n","Going through N03-1021.xml index= 0\n","Going through N03-1022.xml index= 0\n","Going through N03-1024.xml index= 0\n","Going through N03-1026.xml index= 0\n","Going through N03-1028.xml index= 0\n","Going through N03-1030.xml index= 0\n","Going through N03-1033.xml index= 0\n","Going through N03-2002.xml index= 0\n","Going through N03-2021.xml index= 0\n","Going through N04-1001.xml index= 0\n","Going through N04-1013.xml index= 0\n","Going through N04-1014.xml index= 0\n","Going through N04-1015.xml index= 0\n","Going through N04-1016.xml index= 0\n","Going through N04-1019.xml index= 0\n","Going through N04-1021.xml index= 0\n","Going through N04-1022.xml index= 0\n","Going through N04-1023.xml index= 0\n","Going through N04-1025.xml index= 0\n","Going through N04-1030.xml index= 0\n","Going through N04-1033.xml index= 0\n","Going through N04-1035.xml index= 0\n","Going through N04-1041.xml index= 0\n","Going through N04-1042.xml index= 0\n","Going through N04-1043.xml index= 0\n","Going through N04-3012.xml index= 0\n","Going through N04-4015.xml index= 0\n","Going through N04-4026.xml index= 0\n","Going through N04-4038.xml index= 0\n","Going through N06-1003.xml index= 0\n","Going through N06-1006.xml index= 0\n","Going through N06-1011.xml index= 0\n","Going through N06-1014.xml index= 0\n","Going through N06-1020.xml index= 0\n","Going through N06-1025.xml index= 0\n","Going through N06-1033.xml index= 0\n","Going through N06-1039.xml index= 0\n","Going through N06-1041.xml index= 0\n","Going through N06-1056.xml index= 0\n","Going through N06-1058.xml index= 0\n","Going through N06-2013.xml index= 0\n","Going through N06-2015.xml index= 0\n","Going through N06-2033.xml index= 0\n","Going through N07-1011.xml index= 0\n","Going through N07-1018.xml index= 0\n","Going through N07-1023.xml 
index= 0\n","Going through N07-1029.xml index= 0\n","Going through N07-1030.xml index= 0\n","Going through N07-1038.xml index= 0\n","Going through N07-1047.xml index= 0\n","Going through N07-1051.xml index= 0\n","Going through N07-1071.xml index= 0\n","Going through N07-4013.xml index= 0\n","Going through N09-1003.xml index= 0\n","Going through N09-1009.xml index= 0\n","Going through N09-1012.xml index= 0\n","Going through N09-1025.xml index= 0\n","Going through N09-1028.xml index= 0\n","Going through N09-1036.xml index= 0\n","Going through N09-1037.xml index= 0\n","Going through N09-1041.xml index= 0\n","Going through N09-1046.xml index= 0\n","Going through N09-2004.xml index= 0\n","Going through N10-1013.xml index= 0\n","Going through N10-1019.xml index= 0\n","Going through N10-1020.xml index= 0\n","Going through N10-1056.xml index= 0\n","Going through N10-1061.xml index= 0\n","Going through N10-1063.xml index= 0\n","Going through N10-1115.xml index= 0\n","Going through N10-1119.xml index= 0\n","Going through N12-1047.xml index= 0\n","Going through N12-1052.xml index= 0\n","Going through N12-1067.xml index= 0\n","Going through N13-1039.xml index= 0\n","Going through N13-1090.xml index= 0\n","Going through P00-1010.xml index= 0\n","Going through P00-1016.xml index= 0\n","Going through P00-1027.xml index= 0\n","Going through P00-1037.xml index= 0\n","Going through P00-1041.xml index= 0\n","Going through P00-1056.xml index= 0\n","Going through P00-1058.xml index= 0\n","Going through P00-1065.xml index= 0\n","Going through P00-1071.xml index= 0\n","Going through P01-1005.xml index= 0\n","Going through P01-1008.xml index= 0\n","Going through P01-1017.xml index= 0\n","Going through P01-1019.xml index= 0\n","Going through P01-1025.xml index= 0\n","Going through P01-1030.xml index= 0\n","Going through P01-1064.xml index= 0\n","Going through P01-1067.xml index= 0\n","Going through P02-1001.xml index= 0\n","Going through P02-1006.xml index= 0\n","Going through P02-1014.xml 
index= 0\n","Going through P02-1017.xml index= 0\n","Going through P02-1018.xml index= 0\n","Going through P02-1019.xml index= 0\n","Going through P02-1022.xml index= 0\n","Going through P02-1031.xml index= 0\n","Going through P02-1033.xml index= 0\n","Going through P02-1034.xml index= 0\n","Going through P02-1035.xml index= 0\n","Going through P02-1038.xml index= 0\n","Going through P02-1039.xml index= 0\n","Going through P02-1040.xml index= 0\n","Going through P02-1042.xml index= 0\n","Going through P02-1043.xml index= 0\n","Going through P02-1046.xml index= 0\n","Going through P02-1047.xml index= 0\n","Going through P02-1050.xml index= 0\n","Going through P02-1051.xml index= 0\n","Going through P02-1053.xml index= 0\n","Going through P02-1060.xml index= 0\n","Going through P02-1062.xml index= 0\n","Going through P03-1001.xml index= 0\n","Going through P03-1002.xml index= 0\n","Going through P03-1003.xml index= 0\n","Going through P03-1004.xml index= 0\n","Going through P03-1009.xml index= 0\n","Going through P03-1010.xml index= 0\n","Going through P03-1011.xml index= 0\n","Going through P03-1012.xml index= 0\n","Going through P03-1013.xml index= 0\n","Going through P03-1019.xml index= 0\n","Going through P03-1021.xml index= 0\n","Going through P03-1022.xml index= 0\n","Going through P03-1023.xml index= 0\n","Going through P03-1029.xml index= 0\n","Going through P03-1035.xml index= 0\n","Going through P03-1044.xml index= 0\n","Going through P03-1051.xml index= 0\n","Going through P03-1054.xml index= 0\n","Going through P03-1056.xml index= 0\n","Going through P03-1058.xml index= 0\n","Going through P03-1069.xml index= 0\n","Going through P03-1071.xml index= 0\n","Going through P03-2026.xml index= 0\n","Going through P03-2041.xml index= 0\n","Going through P04-1005.xml index= 0\n","Going through P04-1013.xml index= 0\n","Going through P04-1014.xml index= 0\n","Going through P04-1015.xml index= 0\n","Going through P04-1018.xml index= 0\n","Going through P04-1021.xml 
index= 0\n","Going through P04-1035.xml index= 0\n","Going through P04-1036.xml index= 0\n","Going through P04-1041.xml index= 0\n","Going through P04-1043.xml index= 0\n","Going through P04-1053.xml index= 0\n","Going through P04-1054.xml index= 0\n","Going through P04-1056.xml index= 0\n","Going through P04-1061.xml index= 0\n","Going through P04-1066.xml index= 0\n","Going through P04-1075.xml index= 0\n","Going through P04-1077.xml index= 0\n","Going through P04-1083.xml index= 0\n","Going through P04-1085.xml index= 0\n","Going through P04-3022.xml index= 0\n","Going through P05-1001.xml index= 0\n","Going through P05-1010.xml index= 0\n","Going through P05-1011.xml index= 0\n","Going through P05-1012.xml index= 0\n","Going through P05-1013.xml index= 0\n","Going through P05-1015.xml index= 0\n","Going through P05-1017.xml index= 0\n","Going through P05-1018.xml index= 0\n","Going through P05-1020.xml index= 0\n","Going through P05-1022.xml index= 0\n","Going through P05-1033.xml index= 0\n","Going through P05-1034.xml index= 0\n","Going through P05-1036.xml index= 0\n","Going through P05-1044.xml index= 0\n","Going through P05-1045.xml index= 0\n","Going through P05-1047.xml index= 0\n","Going through P05-1052.xml index= 0\n","Going through P05-1053.xml index= 0\n","Going through P05-1057.xml index= 0\n","Going through P05-1059.xml index= 0\n","Going through P05-1065.xml index= 0\n","Going through P05-1066.xml index= 0\n","Going through P05-1067.xml index= 0\n","Going through P05-1071.xml index= 0\n","Going through P05-1072.xml index= 0\n","Going through P05-1073.xml index= 0\n","Going through P05-1074.xml index= 0\n","Going through P05-1077.xml index= 0\n","Going through P05-2008.xml index= 0\n","Going through P05-3026.xml index= 0\n","Going through P06-1004.xml index= 0\n","Going through P06-1005.xml index= 0\n","Going through P06-1009.xml index= 0\n","Going through P06-1010.xml index= 0\n","Going through P06-1011.xml index= 0\n","Going through P06-1014.xml 
index= 0\n","Going through P06-1015.xml index= 0\n","Going through P06-1032.xml index= 0\n","Going through P06-1038.xml index= 0\n","Going through P06-1043.xml index= 0\n","Going through P06-1055.xml index= 0\n","Going through P06-1066.xml index= 0\n","Going through P06-1067.xml index= 0\n","Going through P06-1072.xml index= 0\n","Going through P06-1077.xml index= 0\n","Going through P06-1084.xml index= 0\n","Going through P06-1085.xml index= 0\n","Going through P06-1091.xml index= 0\n","Going through P06-1095.xml index= 0\n","Going through P06-1097.xml index= 0\n","Going through P06-1101.xml index= 0\n","Going through P06-1103.xml index= 0\n","Going through P06-1104.xml index= 0\n","Going through P06-1109.xml index= 0\n","Going through P06-1114.xml index= 0\n","Going through P06-1115.xml index= 0\n","Going through P06-1121.xml index= 0\n","Going through P06-1123.xml index= 0\n","Going through P06-1124.xml index= 0\n","Going through P06-1134.xml index= 0\n","Going through P06-2005.xml index= 0\n","Going through P06-2006.xml index= 0\n","Going through P06-2014.xml index= 0\n","Going through P06-2066.xml index= 0\n","Going through P06-2094.xml index= 0\n","Going through P06-2101.xml index= 0\n","Going through P06-3002.xml index= 0\n","Going through P06-4020.xml index= 0\n","Going through P07-1003.xml index= 0\n","Going through P07-1004.xml index= 0\n","Going through P07-1005.xml index= 0\n","Going through P07-1007.xml index= 0\n","Going through P07-1019.xml index= 0\n","Going through P07-1028.xml index= 0\n","Going through P07-1030.xml index= 0\n","Going through P07-1031.xml index= 0\n","Going through P07-1032.xml index= 0\n","Going through P07-1034.xml index= 0\n","Going through P07-1036.xml index= 0\n","Going through P07-1037.xml index= 0\n","Going through P07-1040.xml index= 0\n","Going through P07-1049.xml index= 0\n","Going through P07-1055.xml index= 0\n","Going through P07-1056.xml index= 0\n","Going through P07-1059.xml index= 0\n","Going through P07-1065.xml 
index= 0\n","Going through P07-1073.xml index= 0\n","Going through P07-1091.xml index= 0\n","Going through P07-1092.xml index= 0\n","Going through P07-1094.xml index= 0\n","Going through P07-1096.xml index= 0\n","Going through P07-1098.xml index= 0\n","Going through P07-1106.xml index= 0\n","Going through P07-1107.xml index= 0\n","Going through P07-1121.xml index= 0\n","Going through P07-1123.xml index= 0\n","Going through P07-1125.xml index= 0\n","Going through P07-2045.xml index= 0\n","Going through P08-1004.xml index= 0\n","Going through P08-1012.xml index= 0\n","Going through P08-1023.xml index= 0\n","Going through P08-1024.xml index= 0\n","Going through P08-1028.xml index= 0\n","Going through P08-1030.xml index= 0\n","Going through P08-1036.xml index= 0\n","Going through P08-1043.xml index= 0\n","Going through P08-1064.xml index= 0\n","Going through P08-1066.xml index= 0\n","Going through P08-1067.xml index= 0\n","Going through P08-1068.xml index= 0\n","Going through P08-1076.xml index= 0\n","Going through P08-1084.xml index= 0\n","Going through P08-1085.xml index= 0\n","Going through P08-1086.xml index= 0\n","Going through P08-1088.xml index= 0\n","Going through P08-1090.xml index= 0\n","Going through P08-1101.xml index= 0\n","Going through P08-1102.xml index= 0\n","Going through P08-1108.xml index= 0\n","Going through P08-1109.xml index= 0\n","Going through P08-1114.xml index= 0\n","Going through P08-1115.xml index= 0\n","Going through P08-1119.xml index= 0\n","Going through P08-2007.xml index= 0\n","Going through P08-2012.xml index= 0\n","Going through P08-2026.xml index= 0\n","Going through P09-1010.xml index= 0\n","Going through P09-1011.xml index= 0\n","Going through P09-1019.xml index= 0\n","Going through P09-1026.xml index= 0\n","Going through P09-1027.xml index= 0\n","Going through P09-1039.xml index= 0\n","Going through P09-1040.xml index= 0\n","Going through P09-1042.xml index= 0\n","Going through P09-1057.xml index= 0\n","Going through P09-1058.xml 
index= 0\n","Going through P09-1068.xml index= 0\n","Going through P09-1074.xml index= 0\n","Going through P09-1077.xml index= 0\n","Going through P09-1088.xml index= 0\n","Going through P09-1094.xml index= 0\n","Going through P09-1104.xml index= 0\n","Going through P09-1113.xml index= 0\n","Going through P09-1116.xml index= 0\n","Going through P09-2004.xml index= 0\n","Going through P09-2012.xml index= 0\n","Going through P10-1001.xml index= 0\n","Going through P10-1040.xml index= 0\n","Going through P10-1044.xml index= 0\n","Going through P10-1052.xml index= 0\n","Going through P10-1110.xml index= 0\n","Going through P10-1142.xml index= 0\n","Going through P10-1146.xml index= 0\n","Going through P10-2041.xml index= 0\n","Going through P10-4002.xml index= 0\n","Going through P11-1016.xml index= 0\n","Going through P11-1019.xml index= 0\n","Going through P11-1020.xml index= 0\n","Going through P11-1038.xml index= 0\n","Going through P11-1055.xml index= 0\n","Going through P11-1060.xml index= 0\n","Going through P11-1061.xml index= 0\n","Going through P11-1098.xml index= 0\n","Going through P11-1138.xml index= 0\n","Going through P11-2008.xml index= 0\n","Going through P11-2031.xml index= 0\n","Going through P11-2033.xml index= 0\n","Going through P12-1092.xml index= 0\n","Going through P13-1045.xml index= 0\n","Going through P83-1007.xml index= 0\n","Going through P83-1019.xml index= 0\n","Going through P83-1020.xml index= 0\n","Going through P83-1021.xml index= 0\n","Going through P84-1008.xml index= 0\n","Going through P84-1018.xml index= 0\n","Going through P84-1075.xml index= 0\n","Going through P84-1085.xml index= 0\n","Going through P85-1008.xml index= 0\n","Going through P85-1011.xml index= 0\n","Going through P85-1018.xml index= 0\n","Going through P86-1004.xml index= 0\n","Going through P86-1031.xml index= 0\n","Going through P87-1015.xml index= 0\n","Going through P87-1022.xml index= 0\n","Going through P87-1033.xml index= 0\n","Going through P88-1012.xml 
index= 0\n","Going through P88-1015.xml index= 0\n","Going through P88-1020.xml index= 0\n","Going through P89-1002.xml index= 0\n","Going through P89-1009.xml index= 0\n","Going through P89-1010.xml index= 0\n","Going through P89-1031.xml index= 0\n","Going through P90-1005.xml index= 0\n","Going through P90-1010.xml index= 0\n","Going through P90-1032.xml index= 0\n","Going through P90-1034.xml index= 0\n","Going through P91-1017.xml index= 0\n","Going through P91-1022.xml index= 0\n","Going through P91-1023.xml index= 0\n","Going through P91-1027.xml index= 0\n","Going through P91-1030.xml index= 0\n","Going through P91-1034.xml index= 0\n","Going through P92-1005.xml index= 0\n","Going through P92-1008.xml index= 0\n","Going through P92-1017.xml index= 0\n","Going through P92-1032.xml index= 0\n","Going through P93-1001.xml index= 0\n","Going through P93-1002.xml index= 0\n","Going through P93-1003.xml index= 0\n","Going through P93-1005.xml index= 0\n","Going through P93-1008.xml index= 0\n","Going through P93-1016.xml index= 0\n","Going through P93-1020.xml index= 0\n","Going through P93-1022.xml index= 0\n","Going through P93-1023.xml index= 0\n","Going through P93-1024.xml index= 0\n","Going through P93-1032.xml index= 0\n","Going through P93-1035.xml index= 0\n","Going through P93-1041.xml index= 0\n","Going through P94-1002.xml index= 0\n","Going through P94-1012.xml index= 0\n","Going through P94-1013.xml index= 0\n","Going through P94-1019.xml index= 0\n","Going through P94-1020.xml index= 0\n","Going through P95-1007.xml index= 0\n","Going through P95-1021.xml index= 0\n","Going through P95-1026.xml index= 0\n","Going through P95-1034.xml index= 0\n","Going through P95-1037.xml index= 0\n","Going through P95-1050.xml index= 0\n","Going through P96-1006.xml index= 0\n","Going through P96-1008.xml index= 0\n","Going through P96-1011.xml index= 0\n","Going through P96-1021.xml index= 0\n","Going through P96-1024.xml index= 0\n","Going through P96-1025.xml 
index= 0\n","Going through P96-1027.xml index= 0\n","Going through P96-1038.xml index= 0\n","Going through P96-1041.xml index= 0\n","Going through P96-1042.xml index= 0\n","Going through P97-1003.xml index= 0\n","Going through P97-1005.xml index= 0\n","Going through P97-1009.xml index= 0\n","Going through P97-1013.xml index= 0\n","Going through P97-1017.xml index= 0\n","Going through P97-1023.xml index= 0\n","Going through P97-1035.xml index= 0\n","Going through P97-1041.xml index= 0\n","Going through P97-1063.xml index= 0\n","Going through P98-1010.xml index= 0\n","Going through P98-1012.xml index= 0\n","Going through P98-1013.xml index= 0\n","Going through P98-1029.xml index= 0\n","Going through P98-1034.xml index= 0\n","Going through P98-1035.xml index= 0\n","Going through P98-1046.xml index= 0\n","Going through P98-1069.xml index= 0\n","Going through P98-1081.xml index= 0\n","Going through P98-1106.xml index= 0\n","Going through P98-1112.xml index= 0\n","Going through P98-2127.xml index= 0\n","Going through P98-2143.xml index= 0\n","Going through P98-2173.xml index= 0\n","Going through P98-2177.xml index= 0\n","Going through P98-2180.xml index= 0\n","Going through P98-2182.xml index= 0\n","Going through P98-2204.xml index= 0\n","Going through P99-1004.xml index= 0\n","Going through P99-1008.xml index= 0\n","Going through P99-1014.xml index= 0\n","Going through P99-1016.xml index= 0\n","Going through P99-1032.xml index= 0\n","Going through P99-1041.xml index= 0\n","Going through P99-1042.xml index= 0\n","Going through P99-1048.xml index= 0\n","Going through P99-1059.xml index= 0\n","Going through P99-1065.xml index= 0\n","Going through P99-1067.xml index= 0\n","Going through P99-1068.xml index= 0\n","Going through P99-1069.xml index= 0\n","Going through P99-1071.xml index= 0\n","Going through S10-1010.xml index= 0\n","Going through S10-1011.xml index= 0\n","Going through S12-1053.xml index= 0\n","Going through W00-0403.xml index= 0\n","Going through W00-0712.xml 
index= 0\n","Going through W00-0717.xml index= 0\n","Going through W00-0726.xml index= 0\n","Going through W00-0730.xml index= 0\n","Going through W00-1201.xml index= 0\n","Going through W00-1303.xml index= 0\n","Going through W00-1308.xml index= 0\n","Going through W00-1401.xml index= 0\n","Going through W00-1427.xml index= 0\n","Going through W01-0501.xml index= 0\n","Going through W01-0511.xml index= 0\n","Going through W01-0513.xml index= 0\n","Going through W01-0514.xml index= 0\n","Going through W01-0521.xml index= 0\n","Going through W01-1313.xml index= 0\n","Going through W01-1605.xml index= 0\n","Going through W02-0109.xml index= 0\n","Going through W02-0301.xml index= 0\n","Going through W02-0505.xml index= 0\n","Going through W02-0603.xml index= 0\n","Going through W02-0817.xml index= 0\n","Going through W02-0902.xml index= 0\n","Going through W02-0908.xml index= 0\n","Going through W02-1001.xml index= 0\n","Going through W02-1006.xml index= 0\n","Going through W02-1011.xml index= 0\n","Going through W02-1018.xml index= 0\n","Going through W02-1021.xml index= 0\n","Going through W02-1028.xml index= 0\n","Going through W02-1039.xml index= 0\n","Going through W02-1210.xml index= 0\n","Going through W02-1502.xml index= 0\n","Going through W02-1503.xml index= 0\n","Going through W02-2016.xml index= 0\n","Going through W02-2018.xml index= 0\n","Going through W02-2024.xml index= 0\n","Going through W02-2026.xml index= 0\n","Going through W03-0301.xml index= 0\n","Going through W03-0404.xml index= 0\n","Going through W03-0405.xml index= 0\n","Going through W03-0407.xml index= 0\n","Going through W03-0419.xml index= 0\n","Going through W03-0424.xml index= 0\n","Going through W03-0425.xml index= 0\n","Going through W03-0428.xml index= 0\n","Going through W03-0430.xml index= 0\n","Going through W03-0501.xml index= 0\n","Going through W03-1006.xml index= 0\n","Going through W03-1008.xml index= 0\n","Going through W03-1011.xml index= 0\n","Going through W03-1014.xml 
index= 0\n","Going through W03-1017.xml index= 0\n","Going through W03-1028.xml index= 0\n","Going through W03-1508.xml index= 0\n","Going through W03-1719.xml index= 0\n","Going through W03-1728.xml index= 0\n","Going through W03-1730.xml index= 0\n","Going through W03-1809.xml index= 0\n","Going through W03-1810.xml index= 0\n","Going through W03-1812.xml index= 0\n","Going through W04-0308.xml index= 0\n","Going through W04-0803.xml index= 0\n","Going through W04-0807.xml index= 0\n","Going through W04-0811.xml index= 0\n","Going through W04-1013.xml index= 0\n","Going through W04-1221.xml index= 0\n","Going through W04-2319.xml index= 0\n","Going through W04-2401.xml index= 0\n","Going through W04-2406.xml index= 0\n","Going through W04-2407.xml index= 0\n","Going through W04-2609.xml index= 0\n","Going through W04-2705.xml index= 0\n","Going through W04-3103.xml index= 0\n","Going through W04-3111.xml index= 0\n","Going through W04-3201.xml index= 0\n","Going through W04-3205.xml index= 0\n","Going through W04-3206.xml index= 0\n","Going through W04-3207.xml index= 0\n","Going through W04-3208.xml index= 0\n","Going through W04-3212.xml index= 0\n","Going through W04-3213.xml index= 0\n","Going through W04-3219.xml index= 0\n","Going through W04-3230.xml index= 0\n","Going through W04-3236.xml index= 0\n","Going through W04-3237.xml index= 0\n","Going through W04-3239.xml index= 0\n","Going through W04-3247.xml index= 0\n","Going through W04-3250.xml index= 0\n","Going through W04-3252.xml index= 0\n","Going through W04-3253.xml index= 0\n","Going through W05-0602.xml index= 0\n","Going through W05-0625.xml index= 0\n","Going through W05-0904.xml index= 0\n","Going through W05-0909.xml index= 0\n","Going through W05-1203.xml index= 0\n","Going through W05-1506.xml index= 0\n","Going through W05-1513.xml index= 0\n","Going through W06-0301.xml index= 0\n","Going through W06-1203.xml index= 0\n","Going through W06-1606.xml index= 0\n","Going through W06-1607.xml 
index= 0\n","Going through W06-1615.xml index= 0\n","Going through W06-1616.xml index= 0\n","Going through W06-1639.xml index= 0\n","Going through W06-1642.xml index= 0\n","Going through W06-1651.xml index= 0\n","Going through W06-1670.xml index= 0\n","Going through W06-2501.xml index= 0\n","Going through W06-2915.xml index= 0\n","Going through W06-2920.xml index= 0\n","Going through W06-2922.xml index= 0\n","Going through W06-2932.xml index= 0\n","Going through W06-2933.xml index= 0\n","Going through W06-3105.xml index= 0\n","Going through W06-3108.xml index= 0\n","Going through W06-3114.xml index= 0\n","Going through W06-3119.xml index= 0\n","Going through W06-3601.xml index= 0\n","Going through W06-3808.xml index= 0\n","Going through W06-3812.xml index= 0\n","Going through W07-0403.xml index= 0\n","Going through W07-0702.xml index= 0\n","Going through W07-0717.xml index= 0\n","Going through W07-0718.xml index= 0\n","Going through W07-0733.xml index= 0\n","Going through W07-0734.xml index= 0\n","Going through W07-1401.xml index= 0\n","Going through W07-1604.xml index= 0\n","Going through W07-2002.xml index= 0\n","Going through W07-2006.xml index= 0\n","Going through W07-2009.xml index= 0\n","Going through W07-2012.xml index= 0\n","Going through W07-2014.xml index= 0\n","Going through W07-2016.xml index= 0\n","Going through W07-2018.xml index= 0\n","Going through W07-2216.xml index= 0\n","Going through W08-0309.xml index= 0\n","Going through W08-0336.xml index= 0\n","Going through W08-0509.xml index= 0\n","Going through W08-1301.xml index= 0\n","Going through W08-2102.xml index= 0\n","Going through W08-2121.xml index= 0\n","Going through W08-2123.xml index= 0\n","Going through W09-0401.xml index= 0\n","Going through W09-0424.xml index= 0\n","Going through W09-0432.xml index= 0\n","Going through W09-0441.xml index= 0\n","Going through W09-1105.xml index= 0\n","Going through W09-1119.xml index= 0\n","Going through W09-1304.xml index= 0\n","Going through W09-1401.xml 
index= 0\n","Going through W10-0204.xml index= 0\n","Going through W10-0701.xml index= 0\n","Going through W10-1703.xml index= 0\n","Going through W10-2805.xml index= 0\n","Going through W10-2903.xml index= 0\n","Going through W10-3001.xml index= 0\n","Going through W11-0705.xml index= 0\n","Going through W11-1801.xml index= 0\n","Going through W11-1802.xml index= 0\n","Going through W11-1901.xml index= 0\n","Going through W11-1902.xml index= 0\n","Going through W11-2103.xml index= 0\n","Going through W11-2107.xml index= 0\n","Going through W11-2123.xml index= 0\n","Going through W12-3102.xml index= 0\n","Going through W93-0301.xml index= 0\n","Going through W94-0319.xml index= 0\n","Going through W95-0101.xml index= 0\n","Going through W95-0103.xml index= 0\n","Going through W95-0104.xml index= 0\n","Going through W95-0105.xml index= 0\n","Going through W95-0107.xml index= 0\n","Going through W95-0115.xml index= 0\n","Going through W96-0102.xml index= 0\n","Going through W96-0208.xml index= 0\n","Going through W96-0213.xml index= 0\n","Going through W96-0214.xml index= 0\n","Going through W97-0109.xml index= 0\n","Going through W97-0119.xml index= 0\n","Going through W97-0209.xml index= 0\n","Going through W97-0301.xml index= 0\n","Going through W97-0302.xml index= 0\n","Going through W97-0311.xml index= 0\n","Going through W97-0313.xml index= 0\n","Going through W97-0322.xml index= 0\n","Going through W97-0703.xml index= 0\n","Going through W97-0713.xml index= 0\n","Going through W97-0802.xml index= 0\n","Going through W97-1306.xml index= 0\n","Going through W98-0705.xml index= 0\n","Going through W98-1106.xml index= 0\n","Going through W98-1115.xml index= 0\n","Going through W98-1118.xml index= 0\n","Going through W98-1119.xml index= 0\n","Going through W98-1411.xml index= 0\n","Going through W99-0501.xml index= 0\n","Going through W99-0604.xml index= 0\n","Going through W99-0611.xml index= 0\n","Going through W99-0612.xml index= 0\n","Going through W99-0613.xml 
index= 0\n","Going through W99-0623.xml index= 0\n","Going through W99-0625.xml index= 0\n","Going through W99-0629.xml index= 0\n","1009\n"]}],"source":["import os\n","import xml.etree.ElementTree as ET\n","import re\n","import pandas as pd\n","\n","# Root of the ScisummNet corpus: one subfolder per paper, each holding\n","# Documents_xml/<paper-id>.xml (full text) and a summary/ folder.\n","top_1000_folder = '/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/scisummnet_final_dataset/top1000_complete'\n","subfolders = sorted(os.listdir(top_1000_folder))\n","input_folder = '/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/dataset/inputs'\n","target_folder = '/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/dataset/targets'\n","\n","files = dict()  # file name -> serialized XML text of the paper\n","errors = []  # kept for later cells that may record parse failures\n","index = 0  # running count of XML files processed\n","\n","# Traverse each paper folder and parse its Documents_xml/<id>.xml file.\n","for subfolder in subfolders:\n","    subfolder_path = os.path.join(top_1000_folder, subfolder)\n","    if os.path.isdir(subfolder_path):\n","        documents_xml_folder = os.path.join(subfolder_path, 'Documents_xml')\n","        summary_folder = os.path.join(subfolder_path, 'summary')\n","        if os.path.isdir(documents_xml_folder):\n","            for file_name in os.listdir(documents_xml_folder):\n","                if file_name.endswith('.xml'):\n","                    xml_file_path = os.path.join(documents_xml_folder, file_name)\n","                    print(\"Going through \", file_name, \" index= \", index)\n","                    tree = ET.parse(xml_file_path)\n","                    root = tree.getroot()\n","                    # BUG FIX: str(ET.tostring(root, 'utf-8')) stored the bytes\n","                    # repr (\"b'...'\"); encoding='unicode' returns the XML as a\n","                    # clean str with no b''-wrapper or escaped bytes.\n","                    files[file_name] = ET.tostring(root, encoding='unicode')\n","                    # BUG FIX: counter was never incremented (every stored output\n","                    # line reads 'index= 0').\n","                    index += 1\n","\n","print(len(files))\n"]},{"cell_type":"code","source":["import os\n","\n","directory = '/path/to/your/directory'\n","\n","if os.access(directory, os.W_OK):\n"," print(\"You have write permission to the 
directory.\")\n","else:\n"," print(\"You do not have write permission to the directory.\")"],"metadata":{"id":"JyceFY048OI9"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":15,"metadata":{"id":"MlUsBTdubidh","colab":{"base_uri":"https://localhost:8080/"},"outputId":"9182d352-b183-4021-f1e4-bc006fbcb4e1","executionInfo":{"status":"ok","timestamp":1719058391620,"user_tz":-240,"elapsed":13169,"user":{"displayName":"Aditi Paretkar","userId":"17466297872366651006"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["printing last sentence\n","TnT is freely available to universities and related organizations for research purposes (see http://www.coli.uni-sb.derthorstenAnt).\n","found it!\n","printing last sentence\n","The reduced sentences produced by humans are also provided for comparison.\n","found it!\n","printing last sentence\n","Even moderately long documents typically address several topics or different aspects of the same topic.\n","found it!\n","printing last sentence\n","This work extends ideas that began in collaboration with Rebecca Bruce and Janyce Wiebe.\n","found it!\n","printing last sentence\n","It is to this project that our future parsing work will be devoted.\n","found it!\n","printing last sentence\n","TOEFL is taken by foreign students who are applying to US undergraduate and graduate-level programs.\n","found it!\n","printing last sentence\n","There is a big gap between the summaries produced by current automatic summarizers and the abstracts written by human professionals.\n","found it!\n","printing last sentence\n","This paper presents three trainable systems for surface natural language generation (NLG).\n","found it!\n","printing last sentence\n","This simple semantic annotation was the only source of task knowledge used to configure the model.\n","found it!\n","printing last sentence\n","This boolean condition is then used to train an improved parser.\n","found it!\n","printing last sentence\n","However, 
a considerably larger corpus would be required to overcome the sparse data problem for other RSA alternations.\n","found it!\n","printing last sentence\n","The proposed method omitted only 5 of 243 noun phrase brackets in the appendix.\n","found it!\n","printing last sentence\n","In Section 8, we address the problem of portability, and wind up by discussing some shortcomings of Joyce in the conclusion.\n","found it!\n","printing last sentence\n","Reusable The effort required to retarget a tagger to new corpora, new tagsets, and new languages should be minimal.\n","found it!\n","printing last sentence\n","We have presented a simple part of speech tagger which performs as well as existing stochastic taggers, but has significant advantages over these taggers.\n","found it!\n","printing last sentence\n","We have shown that terminology research provides a good application for robust natural language technology, in particular for part-of-speech tagging and word-alignment algorithms.\n","found it!\n","printing last sentence\n","From the observations in the previous section, we propose the following guidelines for how to train a HMM for use in tagging: able, use BW re-estimation with standard convergence tests such as perplexity.\n","found it!\n","printing last sentence\n","Ultimately, a multi-engine system depends on the quality of each particular engine.\n","found it!\n","printing last sentence\n","We would also like to thank the anonymous reviewers for their helpful insights.\n","found it!\n","printing last sentence\n","Voutilainen and Juha Heikkild created the original ENGCG lexicon.\n","found it!\n","printing last sentence\n","For a description of the annotation tool see section 5.\n","found it!\n","printing last sentence\n","Given the incredibly difficult nature of many NLP tasks, this example of a learned, stochastic approach to name-finding lends credence to the argument that the NLP community ought to push these approaches, to find the limit of phenomena that may 
be captured by probabilistic, finite-state methods.\n","found it!\n","printing last sentence\n","Sections 5-7 elaborate on Nominator's disambiguation heuristics.\n","printing last sentence\n","Systems that generate natural language output as part of their interaction with a user have become a major area of research and development.\n","found it!\n","printing last sentence\n","Predicate subcategorization is a key component of a lexical entry, because most, if not all, recent syntactic theories 'project' syntactic structure from the lexicon.\n","printing last sentence\n","Explo i t ing a Probabi l ist ic Hierarchical Mode l for Generat ion Srinivas Bangalore and Owen Rambow AT&T Labs Research 180 Park Avenue F lorham Park, NJ 07932 {sr in?, rambow}@research, a r t .\n","printing last sentence\n","Effects of Adjective Orientation and Gradability on Sentence Subjectivity Vas i le ios Hatz ivass i log lou Depar tment o1 Computer Sc ience Co lumbia Un ivers i l y New York, NY 10027 vh@cs , co lumbia , edu Janyce M.\n","found it!\n","printing last sentence\n","The Automated Acquisit ion of Topic Signatures for Text Summarizat ion Chin -Yew L in and Eduard Hovy In fo rmat ion S(:i(umes I l l s t i tu te Un ivers i ty of Southern Ca l i fo rn ia Mar ina del Rey, CA 90292, USA { cyl,hovy }C~isi.edu Abst rac t In order to produce, a good summary, one has to identify the most relevant portions of a given text.\n","found it!\n","printing last sentence\n","Automatic Acquisition of Domain Knowledge for Information Extraction Roman Yangarber, Ralph Grishman Past Tapanainen Courant Inst i tute of Conexor oy Mathemat ica l Sciences Helsinki, F in land New York University {roman [ grishman}@cs, nyu.\n","found it!\n","printing last sentence\n","More accurate tes ts Ibr the s ta t i s t i ca l s ign i f i cance of resu l t d i f ferences * Alexander Yeh Mitre Corp.\n","found it!\n","printing last sentence\n","A Compar i son of A l ignment Mode ls for S ta t i s t i ca l Mach ine Trans 
la t ion Franz Josef Och and Hermann Ney Lehrstuhl fiir Informatik VI, Comlmter Science Department RWTH Aachen - University of Technology D-52056 Aachen, Germany {och, ney}~inf ormat ik.\n","found it!\n","printing last sentence\n","The higher performance of our method can be attributed to the enormity of the web data used and the employment of the EM Algorithm.\n","found it!\n","printing last sentence\n","Our SVM-based NE recognizer attained F = 90.03%.\n","found it!\n","printing last sentence\n","Semantic knowledge for particular domains isincreasingly important in NLP.\n","found it!\n","printing last sentence\n","We presented a clustering algorithm, CBC, for automatically discovering concepts from text.\n","found it!\n","printing last sentence\n","The Penn Chinese Treebank (CTB) is an ongoing project, with its objective being to create a segmented Chinese corpus annotated with POS tags and syntactic brackets.\n","found it!\n","printing last sentence\n","This paper presents a machine learning approach to question classification.\n","found it!\n","printing last sentence\n","The LinGO Redwoods Treebank Motivation and Preliminary Applications Stephan Oepen, Kristina Toutanova, Stuart Shieber, Christopher Manning, Dan Flickinger, and Thorsten Brants {oe |kristina |manning |dan}@csli.stanford.edu, shieber@deas.harvard.edu, brants@parc.xerox.com Abstract The LinGO Redwoods initiative is a seed activity in the de- sign and development of a new type of treebank.\n","found it!\n","printing last sentence\n","The conversion of the Penn Tree bank to dependency trees has been performed using head rules kindly provided by Hiroyasu Yamada and Yuji Matsumoto.\n","found it!\n","printing last sentence\n","(The rule A Section 7 discusses the advantages of the new architecture, Sec tion 8 describes experimental results, and Section 9 summarises the paper.\n","printing last sentence\n","This research was supported by EPSRC grant GR/M96889, and a Commonwealth scholarship and a Sydney 
University Travelling scholarship to the second author.\n","found it!\n","printing last sentence\n","We remain, however, responsible for all content.\n","found it!\n","printing last sentence\n","In this paper, we studied language model adaptation for statistical machine translation.\n","found it!\n","printing last sentence\n","We conclude this paper and discuss future directions in Section 5.\n","found it!\n","printing last sentence\n","In the future, we will consider making an increase the context-size, which helped Toutanova et al (2003).\n","found it!\n","printing last sentence\n","This indicates that CRFs are a viable model for robust Chinese word segmentation.\n","found it!\n","printing last sentence\n","There is a long standing need for higher quality performance in NLP systems.\n","found it!\n","printing last sentence\n","In Sec tion 6, we consider the effects that this has on a potential application of distributional similarity techniques, which is judging compositionality of collocations.\n","found it!\n","printing last sentence\n","In our approach the creation of the semantic representations forms a completely It could cost taxpayers 15 million to install and residents 1 million a year to maintain NP The levels of accuracy and robustness recently achieved by statistical parsers (e.g.\n","printing last sentence\n","In our experiments, we used the commer cial ILP package (Xpress-MP, 2003), and were able to process roughly twenty sentences per second.\n","found it!\n","printing last sentence\n","Sentiment recognition is a challenging and difficult part of understanding opinions.\n","found it!\n","printing last sentence\n","Our results are summarised in Table 4, where we show the mean ratings for our system (Abstract), the baseline (Extract), and the gold standard.\n","found it!\n","printing last sentence\n","Rachele De Felice was supported by an AHRC scholar ship for the duration of her studies.\n","found it!\n","printing last sentence\n","We presented a HMM 
POS tagger for fine-grained tagsets which splits the POS tags into attributevectors and estimates the conditional probabilities of the attributes with decision trees.\n","found it!\n","printing last sentence\n","We presented two approaches for unsupervised ac quisition of unary entailment rules from regular (non-comparable) corpora.\n","found it!\n","printing last sentence\n","We wouldalso like to acknowledge the three anonymous reviewers and Derrick Higgins for their helpful com ments and feedback.\n","found it!\n","printing last sentence\n","In this paper, we have described a uniform approach to analogies, synonyms, antonyms, and as sociations, in which all of these phenomena are subsumed by analogies.\n","found it!\n","printing last sentence\n","4 4We provide the Parser and Hash Kernel as open source for download from http://code.google.com/p/mate-tools.\n","found it!\n","printing last sentence\n","In this paper, we presented a novel large-scale par allel dataset PWKP for sentence simplification.\n","found it!\n","printing last sentence\n","Twitter is one of the most popular social network websites and has been growing at a very fast pace.\n","found it!\n","printing last sentence\n","Automated and manual evaluation protocols and results are presented in Section 5, followed by a short discussion.\n","found it!\n","printing last sentence\n","D-PATR: A Deve lopment Env i ronment fo r Un i f i ca t ion -Based Grammars Lauri Karttunen Artificial Intelligence Center SRI International 333 Ravenswood Avenue Menlo Park, CA 94025 USA and Center for the Study of Language and Information Stanford University 1 Introduction I)-PATR is a development environment for unification-based grammars on Xerox l i00 series work stations.\n","found it!\n","printing last sentence\n","Order Variat ion Worder order variation has always been one of the hardest problems for categorial grammars.\n","found it!\n","printing last sentence\n","A STATISTICAL APPROACH TO LANGUAGE TRANSLAT ION 
P.\n","found it!\n","printing last sentence\n","Parsing Strategies with Lexicalized Grammars: Appl icat ion to Tree Adjoining Grammars * Yves SCHABES, Anne ABE ILLE**and Arav ind K.\n","found it!\n","printing last sentence\n","A Uniform Architecture for Parsing and Generation Stuart M.\n","found it!\n","printing last sentence\n","Feature Structures Based Tree Adjoining Grammars 1 K.\n","found it!\n","printing last sentence\n","Automated language understanding requires the determination f the concept which a given use of a word represents, a process referred to as word sense disambiguation (WSD).\n","found it!\n","printing last sentence\n","CONSTRAINT GRAMMAR AS A FRAMEWORK FOR PARSING RUNNING TEXT Fred Karlsson University of Helsinki Department of General Linguistics Hallituskatu 11 SF-00100 Helsinki Finland e-mail: KARLSS?N@FINUH.bitnet 1.\n","found it!\n","printing last sentence\n","Toward Memory--based Translation Satoshi SATO and Ma.koto NAGAO Dept.\n","found it!\n","printing last sentence\n","The synchronous TAG formalism is inherently nondirec- tional.\n","found it!\n","printing last sentence\n","Typed Unification Grammars Martin C.\n","found it!\n","printing last sentence\n","Automatic Processing of Large Corpora fbr the Resolution of Anaphor References Ido Dagan * Alon Itai Computer Science Department Technion, tIaifa, Israel dagan~techunix .b i tnet , i ta i~ cs.technion, ac.il Abstract Manual acquisition of semantic onstraints in broad domains is very expensive.\n","found it!\n","printing last sentence\n","In this paper, we like to raise the ptx~blems and the difficulties in identifying words and suggest the possible solutions.\n","found it!\n","printing last sentence\n","Two-Level Morphology with Composition Lauri Karttunen, Ronald M.\n","found it!\n","printing last sentence\n","A Fast Algorithm for the Generation of Referring Expressions Abst rac t We simplify previous work in the development of algorithms for the generation of referring expre~ sions 
while at the same time taking account of psy- cholinguistic findings and transcript data.\n","found it!\n","printing last sentence\n","Word-Sense Disambiguation Using Statistical Models of Rogets Categories Trained on Large Corpora David Yarowsky AT&T Bell Laboratories 600 Mountain Avenue Murray Hil l N J, 07974 yarowsky@research.att .com Abst rac t This paper describes a program that disambignates English word senses in unrestricted text using statistical models of the major Rogets Thesaurus categories.\n","printing last sentence\n","Automatic Acquisition of Hyponyms ~om Large Text Corpora Mart i A.\n","found it!\n","printing last sentence\n","A COMPUTATIONAL MODEL OF LANGUAGE DATA ORIENTED PARSING RENS BOlt* Department of Computational I Jnguistics University of Amsterdmn Spuistraat 134 1012 VII Amsterdam The Netherlands rens@alf.let.uva.nl PERFORMANCE: Abstract 1)ata Oriented Parsing (IX)P) is a model where no abstract rules, but language xt~riences in the ti3ru~ of all ,malyzed COlpUS, constitute the basis for langnage processing.\n","found it!\n","printing last sentence\n","SURFACE GRAMMATICAL ANALYSIS FOR THE EXTRACTION OF TERMINOLOGICAL NOUN PHRASES Didier BOURIGAULT Ecole des Hautes Etudes en Sciences Sociales et Electlicit6 de France Direction des Etudes et Recherches 1, avenue du G6n6ral de Gaulle 92141 Clamart Cedex France Tel : +33 1 47 65 50 64 ABSTRACT LEXTER is a software package for extracting terminology.\n","found it!\n","printing last sentence\n","PART-OF-SPEECH TAGGING WITH NEURAL NETWORKS Hehnut Schmid Institute for Computational Linguistics, Azenbergstr.12, 70174 Stuttgart, Germany, schmid@ims.uni-stuttgart.de Topic area: large text corpora, part-of-speech tag- ging, neural networks 1 ABSTRACT Text corpora which are tagged with part-of-speech in- formation are useful in many areas of linguistic re- search.\n","found it!\n","printing last sentence\n","A Stochastic Japanese Morphological Analyzer Using a Forward-DP Backward-A* N-Best Search 
Algor i thm Masa.aki NAGATA NTT Network Information Systems l~,~bor~ttorics 1-2356 Take, Yokosuka-Shi, Kanagaw~t, 238-03 Japan (tel) 4-81-468-59-2796 (fax) +81-468-59-3428 (e-mail) nagata@nttnly.ntt .\n","found it!\n","printing last sentence\n","Comlex Syntax : Bu i ld ing a Computat iona l Lex icon Ra lph Gr i shm:m, Cather ine Mac leod, and Adam Mcyers Computer Science Depar tment , New York Un ivers i ty 715 Broadw,~y, 7th F loor , New York, NY 10003, U.S.A.\n","found it!\n","printing last sentence\n","This paper presents some implementation details and experimental results.\n","found it!\n","printing last sentence\n","]~{ECOGNI:ZING ]:F:XT GENII.ES Wl r l l S:lb,/l:ll,I,; ~/~I,;II/I(~S USING DISCII .\n","found it!\n","printing last sentence\n","K-vec starts by estimating the lexicon.\n","found it!\n","printing last sentence\n","Prel)ositioual phrase attachment disambiguation is a difficult problem.\n","found it!\n","printing last sentence\n","Word Sense Disambiguation using Conceptual Density Eneko Agirre* Lengoaia eta Sistema Informatikoak saila.\n","found it!\n","printing last sentence\n","Lappin and Leass' algorithm for pronominal anaphora resolution is capable of high accuracy, but requires in- depth, full, syntactic parsing of text.\n","printing last sentence\n","Role of Word Sense Disambiguation i Lexical Acquisition: Predicting Semantics from Syntactic Cues Bonn ie J.\n","found it!\n","printing last sentence\n","Three New Probabi l is t ic Mode ls for Dependency Parsing: An Exploration* J ason M.\n","found it!\n","printing last sentence\n","Message Unders tand ing Conference - 6: A Br ie f H is tory Ralph Grishman Dept.\n","found it!\n","printing last sentence\n","A key issne in modeling the string translation probability Pr(J'~le I) is the question of how we define the correspondence b tween the words of the English sentence and the words of the French sentence.\n","printing last sentence\n","Mot ivat ions and Methods tbr Text Simpli f icat ion 
R.\n","found it!\n","printing last sentence\n","and EPSRC (Lap ata; grant EP/C538447/1).\n","found it!\n","printing last sentence\n","We described a statistical syntax-based model that softly aligns a question sentence with a candidateanswer sentence and returns a score.\n","found it!\n","printing last sentence\n","Common assumptions about the role and useful ness of word sense disambiguation (WSD) models in full-scale statistical machine translation (SMT) systems have recently been challenged.\n","found it!\n","printing last sentence\n","and Variational Bayes A Bayesian estimator combines a likelihood termP(x|?, ?) and a prior P(?, ?) to estimate the poste rior probability of a model or hidden state sequence.\n","found it!\n","printing last sentence\n","This work was funded in part by the DARPA GALE program under a subcontract to SRI International.\n","found it!\n","printing last sentence\n","Several kinds of Natural Language Processing systems need measures of semantic relatedness for arbitrary wordpairs.\n","found it!\n","printing last sentence\n","Luke Zettlemoyer was funded by a Microsoft graduateresearch fellowship and Michael Collins was sup ported by the National Science Foundation under grants 0347631 and DMS-0434222.\n","found it!\n","printing last sentence\n","We have presented the HDP-PCFG, a nonparametric Bayesian model for PCFGs, along with an efficient variational inference algorithm.\n","found it!\n","printing last sentence\n","Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning, pp.\n","found it!\n","printing last sentence\n","We would also like to thank the critical and insightful comments from the four anonymous reviewers.\n","found it!\n","printing last sentence\n","We used the Penn Chinese Treebank guidelines (Xueet al, 2005) in searching for a suitable set of reordering rules.\n","found it!\n","printing last sentence\n","We exploited a large number of binary 
features for statistical machine translation.\n","found it!\n","printing last sentence\n","Given a source-language (e.g., French) sentence f ,the problem of machine translation is to automatically produce a target-language (e.g., English) translation e?.\n","found it!\n","printing last sentence\n","We presented an extension of the state-of-the-artphrase-based approach to statistical machine trans lation that allows the straight-forward integration of additional information, may it come from linguistic tools or automatically acquired word classes.\n","found it!\n","printing last sentence\n","Finally, we want to thank the following people,who in different ways assisted us in the organi zation of the CoNLL 2007 shared task: Giuseppe Attardi, Eckhard Bick, Matthias Buch-Kromann,Xavier Carreras, Tomaz Erjavec, Svetoslav Mari nov, Wolfgang Menzel, Xue Nianwen, Gertjan van Noord, Petya Osenova, Florian Schiel, Kiril Simov, Zdenka Uresova, and Heike Zinsmeister.\n","found it!\n","printing last sentence\n","In the multilingual track of the CoNLL 2007 shared task on dependency parsing, a single parser must be trained to handle data from ten different languages: Arabic (Hajic?\n","found it!\n","printing last sentence\n","The author was supported by the Catalan Ministry of Innovation, Universities and Enterprise.\n","found it!\n","printing last sentence\n","This material is based upon work supported by the Defense Advanced Research Projects Agency (DARPA) under Contract No.\n","found it!\n","printing last sentence\n","We close with a discussion that describes several applications of our work (?7).\n","found it!\n","printing last sentence\n","over ones like ?six shooter.?\n","found it!\n","printing last sentence\n","This work was supported in part by Grant-in-Aid for Specially Promoted Re search 18002007.\n","found it!\n","printing last sentence\n","The explosive increase in Web communication hasattracted increasing interest in technologies for automatically mining personal 
opinions from Web doc uments such as product reviews and weblogs.\n","found it!\n","printing last sentence\n","System combination has been applied successfully to various machine translation tasks.\n","found it!\n","printing last sentence\n","This work was partially supported by a National Science Foundation grant IIS#0840608.\n","found it!\n","printing last sentence\n","Computational linguists worry constantly about runtime.\n","found it!\n","printing last sentence\n","The quest for a precise definition of text quality— pinpointing the factors that make text flow and easy to read—has a long history and tradition.\n","printing last sentence\n","Paraphrases are alternative ways of expressing the same information.\n","found it!\n","printing last sentence\n","When combined with our previous work on forest-based decoding, it achieves a 2.5 BLEU points improvement over the baseline, and even outperforms the hierarchical system of Hiero by 0.7 points.\n","found it!\n","printing last sentence\n","We describe our training algorithm in section 2; our generalization of Marton and Resnik’s soft syntactic constraints in section 3; our novel structural distortion features in section 4; and experimental results in section 5.\n","printing last sentence\n","This work was supported in part by the Disruptive Technology Office (DTO)’s Advanced Question Answering for Intelligence (AQUAINT) Phase III Program.\n","printing last sentence\n","This work is partly supported by NSF grant SoD-HCER-0613885 and a grant from Boeing.\n","found it!\n","printing last sentence\n","Any opinions, findings, and conclusions or recommendations expressed above are those of the authors and do not necessarily reflect the views of the NSF.\n","found it!\n","printing last sentence\n","Probabilistic models now play a central role in computational linguistics.\n","found it!\n","printing last sentence\n","We developed a graph-based and a transition-based projective dependency parser using beam-search, 
demonstrating that beam-search is a competitive choice for both parsing approaches.\n","found it!\n","printing last sentence\n","We hope that our approach will provide some insight into the design of lattice-based search procedures along with the use of non-linear, global loss functions such as BLEU.\n","found it!\n","printing last sentence\n","This paper introduces the first unsupervised coreference resolution system that is as accurate as supervised systems.\n","found it!\n","printing last sentence\n","Many statistical methods in natural language processing aim at minimizing the probability of sentence errors.\n","found it!\n","printing last sentence\n","We are also interested in investigating ways to apply the generative model to the inverse task: generation of a NL sentence that explains a given MR structure.\n","found it!\n","printing last sentence\n","We also thank Eric Breck, Lillian Lee, Mats Rooth, the members of the Cornell NLP reading seminar, and the EMNLP reviewers for insightful comments on the submitted version of the paper.\n","found it!\n","printing last sentence\n","We finally show in Section 6 that our ap proach yields results that are significantly better thanprevious approaches for two language pairs and dif ferent test sets.\n","found it!\n","printing last sentence\n","Furthermore, by using this joint parsing technique to preprocess the input to a syntactic MT system, we obtain a 2.4 BLEU improvement.\n","found it!\n","printing last sentence\n","This paper introduces the first unsupervised approach to learning semantic parsers.\n","found it!\n","printing last sentence\n","We implement the expectation and variance semirings in Joshua (Li et al., 2009a), and demonstrate their practical benefit by using minimumrisk training to improve Hiero (Chiang, 2007).\n","found it!\n","printing last sentence\n","And despite its generative semantics, we show that Labeled LDA is competitive with a strong baseline discriminative classifier on two multi-label 
text classification tasks (Section 7).\n","found it!\n","printing last sentence\n","Conventional wisdom holds that manual evaluation of machine translation is too time-consuming and expensive to conduct.\n","found it!\n","printing last sentence\n","The main choice in the approach is the partitioning of f(x, y) into components r1(x, y) ...\n","found it!\n","printing last sentence\n","Finally, we present experiments on cross-lingual parser projection in conditions when no target language trees are available for training (§5) and when some trees are available (§6).\n","printing last sentence\n","By linking topics across languages, polylingual topic models can increase cross-cultural understanding by providing readers with the ability to characterize the contents of collections in unfamiliar languages and identify trends in topic prevalence.\n","found it!\n","printing last sentence\n","We proposed a highly scalable term similarity algorithm, implemented in the MapReduce framework, and deployed over a 200 billion word crawl of the Web.\n","found it!\n","printing last sentence\n","We thank the three anonymous reviewers for their invaluable comments on the paper.\n","found it!\n","printing last sentence\n","The resolution of entity reference is influenced by a variety of constraints.\n","found it!\n","printing last sentence\n","Specifically, we make the following contributions: Ambiguity resolution is a central task in Natural Language Processing.\n","printing last sentence\n","In Section 4, we present related work and Section 5 concludes the paper.\n","found it!\n","printing last sentence\n","Dynamic programming algorithms have been remarkably useful for inference in many NLP problems.\n","found it!\n","printing last sentence\n","Domain adaptation is a common concern when optimizing empirical NLP applications.\n","found it!\n","printing last sentence\n","We also thank Nicholas Rizzolo and Dan Roth for helping us replicate their experimental setup, and Heng Ji and Dekang 
Lin for providing their gender lexicon.\n","found it!\n","printing last sentence\n","Section 8 concludes by sketching directions for further work.\n","found it!\n","printing last sentence\n","This paper has presented a method for inducing probabilistic CCGs from sentences paired with logical forms.\n","found it!\n","printing last sentence\n","Any opinions, findings, conclusions, or recommendations expressed in this paper are those of the authors, and do not necessarily reflect the views of the funding organizations.\n","found it!\n","printing last sentence\n","Sociolinguistics and dialectology study how language varies across social and regional contexts.\n","found it!\n","printing last sentence\n","Non-projective dependency parsing is useful for many languages that exhibit non-projective syntactic structures.\n","found it!\n","printing last sentence\n","We presented a simple, yet effective approach for projecting parsers from languages with labeled training data to languages without any labeled training data.\n","found it!\n","printing last sentence\n","After describing the model in detail, we evaluate it qualitatively by analyzing the learned n-gram vector representations and compare quantitatively against other methods on standard datasets and the EP dataset.\n","found it!\n","printing last sentence\n","Statistical Machine Translation (SMT) system performance is dependent on the quantity and quality of available training data.\n","found it!\n","printing last sentence\n","The task consists of deciding, given a text (T) and an hypothesis (H) in different languages, if the meaning of H can be inferred from the meaning of T.\n","found it!\n","printing last sentence\n","Thanks also to the anonymous reviewers, especially the reviewer who implemented PRO during the review period and replicated our results.\n","found it!\n","printing last sentence\n","Support from EPSRC grant EP/F042728/1 is gratefully acknowledged by M.\n","found it!\n","printing last sentence\n","We 
describe related work in §4 and conclude in §5.\n","printing last sentence\n","Section 6 concludes with a summary and discussion of future work.\n","found it!\n","printing last sentence\n","In this paper we systematically compared three types of distributional representation and their effect on semantic composition.\n","found it!\n","printing last sentence\n","We have presented the first system for joint partof-speech tagging and labeled dependency parsing with non-projective dependency trees.\n","found it!\n","printing last sentence\n","Compared to Bod (2001), our results show an 11% improvement in terms of relative error reduction and a speedup which reduces the processing time from 220 to 3.6 seconds per WSJ sentence.\n","found it!\n","printing last sentence\n","In this paper we describe how co-training (Blum and Mitchell, 1998) can be used to bootstrap a pair of statistical parsers from a small amount of annotated training data.\n","found it!\n","printing last sentence\n","We plan to use this stronger form of information using Pair Hidden Markov Models as described in (Clark, 2001).\n","found it!\n","printing last sentence\n","We would like to thank Joshua Goodman, Miles Osborne, Andrew Smith, Hanna Wallach, Tara Murphy and the anonymous reviewers for their comments on drafts of this paper.\n","found it!\n","printing last sentence\n","For the second and third, we provide differently prepared training corpora to statistical machine translation systems.\n","found it!\n","printing last sentence\n","w+ C P q;k subject to: w ((q; q:e) As illustrated in Section 1, the same proper name may refer to more than one named entity.\n","printing last sentence\n","In this work we describe a novel technique for computing a consensus translation from the outputs of multiple machine translation systems.\n","found it!\n","printing last sentence\n","Dependency representations of sentences (Hudson, 1984; Me´lˇcuk, 1988) model head-dependent syntactic relations as edges in a 
directed graph.\n","printing last sentence\n","This research is partially supported by the Presto Space EU Project#: FP6-507336.\n","found it!\n","printing last sentence\n","Opinion mining is a recent subdiscipline of computational linguistics which is concerned not with the topic a document is about, but with the opinion it expresses.\n","found it!\n","printing last sentence\n","This paper contributes to the development of NLP and semantic tagging systems in several respects.\n","found it!\n","printing last sentence\n","Section 6 will conclude the paper and give an outlook on possible future work.\n","found it!\n","printing last sentence\n","Finally, we discuss appropriate uses for Bleu and suggest that for some research projects it may be preferable to use a focused, manual evaluation instead.\n","found it!\n","printing last sentence\n","The ability to compress sentences grammatically with minimal information loss is an important problem in text summarization.\n","found it!\n","printing last sentence\n","Many thanks to John Carroll, Roger Evans and the anonymous reviewers for very helpful comments.\n","found it!\n","printing last sentence\n","In this paper, we propose TroFi (Trope Finder), a nearly unsupervised clustering method for separating literal and nonliteral usages of verbs.\n","found it!\n","printing last sentence\n","The term idiom has been applied to a fuzzy category with prototypical examples such as by and large, kick the bucket, and let the cat out of the bag.\n","found it!\n","printing last sentence\n","Our long term goal is to populate databases and ontologies by extracting information from large text collections such as Medline.\n","found it!\n","printing last sentence\n","Finally, we draw some conclusions in Section 8.\n","found it!\n","printing last sentence\n","Sense induction is the task of discovering automatically all possible senses of an ambiguous word.\n","found it!\n","printing last sentence\n","In the last few years, so called 
finite-state morphology, in general, and two-level morphology in particular, have become widely accepted as paradigms for the computational treatment of morphology.\n","found it!\n","printing last sentence\n","In other words, given a theory qc and a sentence S, S is provable from T if $ rd(pc1( 2)).\n","found it!\n","printing last sentence\n","In this paper we sketch an approach to machine translation that offers several advantages compared to many of the other strategies currently being pursued.\n","found it!\n","printing last sentence\n","We report on experiments which show the difference in performance between the NE system with gazetteers of different sizes for three types of named entities: people, organisations and locations.\n","found it!\n","printing last sentence\n","Word classes are often used in language modelling to solve the problem of sparse data.\n","found it!\n","printing last sentence\n","Some more room for improved performance lies in computing the POS tags in the data with a better tagger than presently used.\n","found it!\n","printing last sentence\n","Inducing Multilingual Text Analysis Tools via Robust Projection across Aligned Corpora David Yarowsky Dept.\n","found it!\n","printing last sentence\n","A good performance metric should have the following two properties: A working definition of coreference resolution is partitioning the noun phrases we are interested in into equiv alence classes, each of which refers to a physical entity.We adopt the terminologies used in the Automatic Con tent Extraction (ACE) task (NIST, 2003a) and call eachindividual phrase a mention and equivalence class an en tity.\n","printing last sentence\n","Finally, our method scales to large numbers of training sentences and trains in minutes rather than hours or days for thehigher-numbered IBM models, a particular ad vantage when not using features derived from those slower models.\n","found it!\n","printing last sentence\n","Bilingual word alignment is the first step 
of most current approaches to statistical machine translation.Although the best performing systems are ?phrase based?\n","found it!\n","printing last sentence\n","This paper presented a word aligner trained on anno tated data.\n","found it!\n","printing last sentence\n","Wealso note the prior work of Wu (1996), closely re lated to Tillmann?s model.\n","found it!\n","printing last sentence\n","The remainder of this paper is organized as follows: Section 2 introduces the basic terminology, Section 3 gives an overview of OPINE, describes and evaluates its main components, Section 4 describes related work and Section 5 presents our conclusion.\n","found it!\n","printing last sentence\n","Sentiment analysis is the task of identifying positive and negative opinions, emotions, and evaluations.\n","found it!\n","printing last sentence\n","The resulting system identifies opinionsources with 79.3% precision and 59.5% recall using a head noun matching measure, and 81.2% pre cision and 60.6% recall using an overlap measure.\n","found it!\n","printing last sentence\n","The method for automatically finding the predominant sense beat SemCor consistently in our experiments.\n","found it!\n","printing last sentence\n","Although SVMs do not output probabilities, theeasiest-first method would be easily applied by considering the margins output by SVMs as the confi dence of local classification.\n","found it!\n","printing last sentence\n","This work has been supported by NSF ITR grants 0205448 and 0428193.\n","found it!\n","printing last sentence\n","The authors take sole re sponsibility for the work.\n","found it!\n","printing last sentence\n","Are there subsets of the test suitethat are more suited to any particular textual en tailment recognition method?\n","found it!\n","printing last sentence\n","We have presented a new kernel for relation extraction based on the shortest-path between the two rela tion entities in the dependency graph.\n","found it!\n","printing last 
sentence\n","Proceedings of HLT/EMNLP 2005 Demonstration Abstracts, pages 34?35, Vancouver, October 2005.\n","found it!\n","printing last sentence\n","Identifying Word Correspondences in Parallel Texts William A.\n","found it!\n","printing last sentence\n","A Procedure for Quantitatively Comparing the Syntactic Coverage of English Grammars E.\n","found it!\n","printing last sentence\n","Towards History-based Grammars: Using Richer Models for Probabil ist ic Parsing* Ezra Black Fred Jelinek John Lafferty David M.\n","found it!\n","printing last sentence\n","One Sense Per D iscourse William A.\n","found it!\n","printing last sentence\n","CORPUS-BASED STAT IST ICAL SENSE RESOLUTION Claudia Leacock, 1 Geoffrey Towell, 2 Ellen Voorhees 2 1Princeton University, Cognitive Science Laboratory, Princeton, New Jersey 08542 2Siemens Corporate Research, Inc., Princeton, New Jersey 08540 ABSTRACT The three corpus-based statistical sense resolution methods studied here attempt o infer the correct sense of a polyse- mous word by using knowledge about patterns of word co- occurrences.\n","found it!\n","printing last sentence\n","ONE SENSE PER COLLOCATION David Yarowsky* Department of Computer and In format ion Science Univers i ty of Pennsy lvania Philadelphia, PA 19104 yarowsky@unagi .c is .upenn.edu ABSTRACT Previous work [Gale, Church and Yarowsky, 1992] showed that with high probability a polysemous word has one sense per discourse.\n","found it!\n","printing last sentence\n","A SEMANTIC CONCORDANCE George A.\n","found it!\n","printing last sentence\n","THE PENN TREEBANK: ANNOTATING PREDICATE ARGUMENT STRUCTURE Mitchell Marcus, Grace Kim, Mary Ann Marcinkiewicz, Robert MacIntyre, Ann Bies, Mark Ferguson, Karen Katz, Britta Schasberger Department of Computer and Information Science University of Pennsylvania Philadelphia, PA, USA ABSTRACT The Penn Treebank has recently implemented a new syn- tactic annotation scheme, designed to highlight aspects of predicate-argument 
structure.\n","found it!\n","printing last sentence\n","USING A SEMANTIC CONCORDANCE FOR SENSE IDENTIFICATION George A.\n","found it!\n","printing last sentence\n","A Maximum Entropy Model for Prepositional Phrase Attachment Adwait Ratnaparkhi, Jeff Reynar,* and Salim Roukos IBM Research D iv is ion Thomas J.\n","found it!\n","printing last sentence\n","A subset of the GENIA corpus is annotated for syntactic (tree) structure.\n","found it!\n","printing last sentence\n","was initially supplied in Big Five/ HKSCS.\n","found it!\n","printing last sentence\n","A Maximum Entropy Approach to Chinese Word Segmentation Jin Kiat Low 1 and Hwee Tou Ng 1,2 and Wenyuan Guo 2 1.\n","found it!\n","printing last sentence\n","Thanks to Kristina Toutanova for her generous help and to Jenny Rose Finkel who devel oped such a great conditional random field package.\n","found it!\n","printing last sentence\n","This is a China book (Chinese book) compounds): I am a student of university (university student) 8.\n","found it!\n","printing last sentence\n","Experimental results are given for applying the training method to translation from English to Spanish and Japanese.\n","found it!\n","printing last sentence\n","Parallel texts (bitexts) have properties that distinguish them from other kinds of parallel data.\n","found it!\n","printing last sentence\n","We achieved good dialogue act labeling accuracy (65% based on errorful, automatically recognized words and prosody, and 71% based on word transcripts, compared to a chance baseline accuracy of 35% and human accuracy of 84%) and a small reduction in word recognition error.\n","found it!\n","printing last sentence\n","Chinese is written without using spaces or other word delimiters.\n","found it!\n","printing last sentence\n","Universidade do Vale do Rio dos Sinos University of Edinburgh We present an implemented system for processing definite descriptions in arbitrary domains.\n","found it!\n","printing last sentence\n","The authors wish 
to thank Christy Doran, Renate Henschel, Adam Kilgarriff, Paul Piwek, Massimo Poesio, Richard Power, and four anonymous referees for their comments on an earlier draft of this paper.\n","found it!\n","printing last sentence\n","But the assumption made in the text is entirely reasonable, and simplifies the construction for us.\n","found it!\n","printing last sentence\n","The reduction in error rate varies with the material in question, but can be as high as 24.3% with the LOB corpus.\n","found it!\n","printing last sentence\n","Finally, the author would like to express his appreciation to the participants of discussions during meetings of the Brown The author wishes to thank Mark Johnson for invaluable discussion, guidance, and moral support over the course of this project.\n","printing last sentence\n","It is argued that this approach is more likely to assist the creation of practical systems.\n","found it!\n","printing last sentence\n","Automatic acquisition of lexical knowledge is critical to a wide range of natural language processing tasks.\n","found it!\n","printing last sentence\n","We also thank Beth Sundheim for helpful comments on an earlier version of this paper, and Hai Leong Chieu for his implementation of the HMM-based named entity recognition module.\n","found it!\n","printing last sentence\n","This work was completed while the second author was a visiting professor at Harvard University.\n","found it!\n","printing last sentence\n","Helpful comments from the reviewers of Computational Linguistics are also gratefully acknowledged.\n","found it!\n","printing last sentence\n","In addition, the performance of our method is investigated using both the standard Pearson chisquare statistic and the log-likelihood chi-square statistic.\n","found it!\n","printing last sentence\n","This work was primarily funded by National Science Foundation grant ITR/HCI #0086132 to the FrameNet project.\n","found it!\n","printing last sentence\n","The work reported in this 
article was conducted while both authors were in the HCRC Language Technology Group at the University of Edinburgh.\n","found it!\n","printing last sentence\n","In the Appendix, we present an efficient training algorithm for the alignment models presented.\n","found it!\n","printing last sentence\n","We would like to thank Dennis van Oort and Denis Gerritsen for their help in the implementation and Alexander Koller and Kees van Deemter for some very useful discussions.\n","found it!\n","printing last sentence\n","IBM T.\n","found it!\n","printing last sentence\n","This special issue of Computational Linguistics explores ways in which this dream is being explored.\n","found it!\n","printing last sentence\n","Keezer for permitting and facilitating our use of the Internet Archive.\n","found it!\n","printing last sentence\n","Special thanks are due to Stephen Clark and Detlef Prescher for making their pseudodisambiguation data sets available.\n","found it!\n","printing last sentence\n","My Ph.D.\n","found it!\n","printing last sentence\n","In addition to quantifying performance, we analyze the results to investigate the situations in which the selectional preferences achieve the best precision and in which the one-sense-per-discourse heuristic increases performance.\n","found it!\n","printing last sentence\n","CorMet is a corpus-based system for discovering metaphorical mappings between concepts.\n","found it!\n","printing last sentence\n","Thanks to Janet Cahn and to the anonymous reviewers for comments on earlier drafts.\n","found it!\n","printing last sentence\n","The improvement of the translation results is demonstrated on two German-English corpora taken from the Uerbmobil task and the Nespole!\n","found it!\n","printing last sentence\n","Finally, the clues are used to perform opinion piece recognition (a type of text categorization and genre detection) to demonstrate the utility of the knowledge acquired in this article.\n","found it!\n","printing last 
sentence\n","A phrase-based statistical machine translation approach — the alignment template approach — is described.\n","printing last sentence\n","SBR-89-20239 and DARPA grant no.\n","found it!\n","printing last sentence\n","Finally, thanks to the anonymous reviewers for several useful comments.\n","found it!\n","printing last sentence\n","The resulting resource can be thought of as shallow, in that it does not represent coreference, quantification, and many other higher-order phenomena, but also broad, in that it covers every instance of every verb in the corpus and allows representative statistics to be calculated.\n","found it!\n","printing last sentence\n","A system that can produce informative summaries, highlighting common information found in many online documents, will help Web users to pinpoint information that they need without extensive reading.\n","found it!\n","printing last sentence\n","Thus, our method can be applied with great benefit to language pairs for which only scarce resources are available.\n","found it!\n","printing last sentence\n","Evaluating WordNet-based Measures of Lexical Semantic Relatedness Alexander Budanitsky?\n","found it!\n","printing last sentence\n","Thanks to the anonymous reviewers of Computational Linguistics for their very helpful comments and suggestions.\n","found it!\n","printing last sentence\n","We present a statistical machine translation model that uses hierarchical phrases—phrases that contain subphrases.\n","printing last sentence\n","This article presents an algorithm for translating the Penn Treebank into a corpus of Combinatory Categorial Grammar (CCG) derivations augmented with local and long-range word–word dependencies.\n","printing last sentence\n","This article has shown how to estimate a log-linear parsing model for an automatically extracted CCG grammar, on a very large scale.\n","found it!\n","printing last sentence\n","This article proposes a novel framework for representing and measuring local 
coherence.\n","found it!\n","printing last sentence\n","We would also like to thank Takashi Ninomiya and Kenji Sagae for their precious support.\n","found it!\n","printing last sentence\n","This research was carried out while all the authors were at Stanford University.\n","found it!\n","printing last sentence\n","The Importance of Syntactic Parsing and Inference in Semantic Role Labeling Vasin Punyakanok??\n","found it!\n","printing last sentence\n","The work has been partially supported by the Swedish Research Council.\n","found it!\n","printing last sentence\n","We are also extremely grateful to the British Library in London, which made accessible to us virtually every paper we needed for this research.\n","found it!\n","printing last sentence\n","Many approaches to automatic sentiment analysis begin with a large lexicon of words marked with their prior polarity (also called semantic orientation).\n","found it!\n","printing last sentence\n","Over the last two decades, there has been much research on paraphrase extraction and generation within a number of research communities in natural language processing, in order to improve the specific application with which that community is concerned.\n","found it!\n","printing last sentence\n","This separation is in line with what is commonly assumed in cognitive science and formal linguistics, and we hope it will contribute to make corpus-based modeling a core part of the ongoing study of semantic knowledge in humans and machines.\n","found it!\n","printing last sentence\n","Heuristics are suggested to decide among the interpretations.\n","found it!\n","printing last sentence\n","The extended formalism makes it easy to describe left extraposition of constituents, an important feature of natural language syntax.\n","found it!\n","printing last sentence\n","Sentences are far more ambiguous than one might have thought.\n","found it!\n","printing last sentence\n","This processing description specifies in these recognition 
tasks the role of information from the discourse and from the participants' knowledge of the domain.\n","printing last sentence\n","Also, a commercial on-line parser for Japanese language is being built by Intelligent Technology Incorporation, based on the technique developed at CMU.\n","found it!\n","printing last sentence\n","and Center for the Study of Language and Information Stanford University Stanford, CA 94305 The syntactic structure of a sentence often manifests quite clearly the predicate-argument structure and relations of grammatical subordination.\n","found it!\n","printing last sentence\n","The high degree of lexical category ambiguity in languages such as English poses problems for parsing.\n","found it!\n","printing last sentence\n","We claim that any manageable formalism for naturallanguage temporal descriptions will have to embody such an ontology, as will any usable temporal database for knowledge about events which is to be interrogated using natural language.\n","found it!\n","printing last sentence\n","Philadelphia, PA 19104-6389 In this paper, I consider a range of English expressions and show that their context-dependency can be characterized in terms of two properties: 1.\n","found it!\n","printing last sentence\n","The term word association is used in a very particular sense in the psycholinguistic literature.\n","found it!\n","printing last sentence\n","We present an algorithm for generating strings from logical form encodings that improves upon previous algorithms in that it places fewer restrictions on the class of grammars to which it is applicable.\n","found it!\n","printing last sentence\n","There are many ways in which the simple models described in this paper can be improved.\n","found it!\n","printing last sentence\n","The lexical chains also provide a semantic context for interpreting words, concepts, and sentences.\n","found it!\n","printing last sentence\n","The met* method is compared with approaches from artificial 
intelligence, linguistics, philosophy, and psychology.\n","found it!\n","printing last sentence\n","In this paper, I will discuss four major topics relating to current research in lexical semantics: methodology, descriptive coverage, adequacy of the representation, and the computational usefulness of representations.\n","found it!\n","printing last sentence\n","We will also discuss an application of the approach in a system that computes sense tags for arbitrary texts, even when it is unable to determine a single syntactic or semantic representation for some sentences.\n","found it!\n","printing last sentence\n","I would also like to thank several anonymous reviewers for their careful critiques, the outcome of which was a substantially improved document.\n","found it!\n","printing last sentence\n","IBM T.\n","found it!\n","printing last sentence\n","We are grateful to Barbara Grosz, Kathy McCoy, Cecile Paris, Donia Scott, Karen Sparck Jones, and an anonymous reviewer for their comments on this research.\n","found it!\n","printing last sentence\n","The flourishing renaissance of empiricism in computational linguistics grew out of the experience of the speech recognition community during the 1970s and 1980s.\n","found it!\n","printing last sentence\n","All errors and mistakes remain our responsibility.\n","found it!\n","printing last sentence\n","Much work has been done on the statistical analysis of text.\n","found it!\n","printing last sentence\n","The probability is based on two parameters, the mean and variance of number of foreign characters per English character.\n","found it!\n","printing last sentence\n","This suggests that a distributional approach can provide an approximate solution to parsing problems that, in the worst case, call for complex reasoning.\n","found it!\n","printing last sentence\n","We present an algorithm for aligning texts with their translations that is based only on internal evidence.\n","found it!\n","printing last sentence\n","Natural 
languages are full of collocations, recurrent combinations of words that co-occur more often than expected by chance and that correspond to arbitrary word usages.\n","found it!\n","printing last sentence\n","Of the 193 verbs listed above, Lerner detects 174 in the untagged version of the Brown Corpus.\n","found it!\n","printing last sentence\n","We describe a series of five statistical models of the translation process and give algorithms for estimating the parameters of these models given a set of pairs of sentences that are translations of one another.\n","found it!\n","printing last sentence\n","Building a Large Annotated Corpus of English: The Penn Treebank Mitchell P.\n","found it!\n","printing last sentence\n","We would also like to thank Mats Rooth, Scott Waterman, and four anonymous reviewers for useful comments and discussion.\n","found it!\n","printing last sentence\n","Here the algorithm tries to combine the constituent to the right of the conjunction with that on the left of the conjunction.\n","found it!\n","printing last sentence\n","Cue phrases are linguistic expressions such as now and well that function as explicit indicators of the structure of a discourse.\n","found it!\n","printing last sentence\n","I also want to thank one of the referees for his judicious comments.\n","found it!\n","printing last sentence\n","Japanese Discourse and the Process of Centering Mar i lyn Walker* University of Pennsylvania Sharon Cotes University of Pennsylvania Masayo I ida t Stanford University This paper has three aims: (1) to generalize a computational ccount of the discourse process called CENTERING, (2) to apply this account o discourse processing in Japanese so that it can be used in computational systems for machine translation or language understanding, and (3) to provide some insights on the effect of syntactic factors in Japanese on discourse interpretation.\n","found it!\n","printing last sentence\n","We are particularly indebted to Danny Bobrow for 
helpful discussions in the early stages of the research on rewriting systems.\n","found it!\n","printing last sentence\n","We report the results of analyzing 150 test sentences, which are different from the 30 training sentences used in the parameter adjustment, to illustrate the effectiveness of our method.\n","found it!\n","printing last sentence\n","This paper presents an algorithm for identifying the noun phrase antecedents of third person pronouns and lexical anaphors (reflexives and reciprocals).\n","found it!\n","printing last sentence\n","The paper includes a detailed comparative analysis of statistical sense disambiguation methods.\n","found it!\n","printing last sentence\n","Machine Translation Divergences: A Formal Description and Proposed Solution Bonnie J.\n","found it!\n","printing last sentence\n","Thanks are due Dan Jurafsky and Steve Omohundro for extensive discussions on the topics in this paper, and Fernando Pereira for helpful advice and pointers.\n","found it!\n","printing last sentence\n","This paper concerns relationships among focus of attention, choice of referring expression, and perceived coherence of utterances within a discourse segment.\n","found it!\n","printing last sentence\n","We present a detailed case study of this learning method applied to part-of-speech tagging.\n","found it!\n","printing last sentence\n","Collocations are notoriously difficult for non-native speakers to translate, primarily because they are opaque and cannot be translated on a word-by-word basis.\n","found it!\n","printing last sentence\n","The concept of maximum entropy can be traced back along multiple threads to Biblical times.\n","found it!\n","printing last sentence\n","We discuss what is wrong with reliability measures as they are currently used for discourse and dialogue work in computational linguistics and cognitive science, and argue that we would be better off as afield adopting techniques from content analysis.\n","found it!\n","printing last 
sentence\n","We also thank Chao-Huang Chang, reviewers for the 1994 ACL conference, and four anonymous reviewers for Computational Linguistics for useful comments.\n","found it!\n","printing last sentence\n","This work was completed within the Dialogue Group of the Human Communication Research Centre.\n","found it!\n","printing last sentence\n","Multi-paragraph subtopic segmentation should be useful for many text analysis tasks, including information retrieval and summarization.\n","found it!\n","printing last sentence\n","The need to model the relation between discourse structure and linguistic features of utterances is almost universally acknowledged in the literature on discourse.\n","found it!\n","printing last sentence\n","Some applications of these algorithms in speech recognition are described and illustrated.\n","found it!\n","printing last sentence\n","We discuss a number of examples of how stochastic inversion transduction grammars bring bilingual constraints to bear upon problematic corpus analysis tasks such as segmentation, bracketing, phrasal alignment, and parsing.\n","found it!\n","printing last sentence\n","Using the proposed technique, unknown-word-guessing rule sets were induced and integrated into a stochastic tagger and a rule-based tagger, which were then applied to texts with unknown words.\n","found it!\n","printing last sentence\n","Probabilistic analogues of regular and context-free grammars are well known in computational linguistics, and currently the subject of intensive research.\n","found it!\n","printing last sentence\n","Work on automatic WSD has a history as long as automated language processing generally.\n","found it!\n","printing last sentence\n","Test results are compared with those from manually tagged training examples.\n","found it!\n","printing last sentence\n","We wish to thank Jean Carletta for much help both with designing the experiments and with the analysis of the results.\n","found it!\n","printing last 
sentence\n","A new method for automatically acquiring case frame patterns from large corpora is proposed.\n","found it!\n","printing last sentence\n","Best-first parsing methods for natural language try to parse efficiently by considering the most likely constituents first.\n","found it!\n","printing last sentence\n","This work was partially supported by NSF grants GER-90-24069, IRI-96-19124, IRI-96-18797, and CDA-96-25374, as well as a grant from Columbia University's Strategic Initiative Fund sponsored by the Provost's Office.\n","printing last sentence\n","We would also like to thank our sponsors at the Department of Defense.\n","found it!\n","printing last sentence\n","I would like to thank Dick Oehrle and Chris Manning, Eugene Charniak and my other colleagues at Brown, and the CL reviewers for their excellent advice in this research.\n","found it!\n","printing last sentence\n","Both the maps and the alignments are available from the Linguistic Data Consortium.' Texts that are available in two languages (bitexts) are becoming more and more plentiful, both in private data warehouses and on publicly accessible sites on the World Wide Web.\n","printing last sentence\n","In this paper, we have proposed novel methods for robust parsing that integrate the flexibility of linguistically motivated lexical descriptions with the robustness of statistical techniques.\n","found it!\n","printing last sentence\n","Considering empirical evidence from a free-word-order language (German) we propose a revision of the principles guiding the ordering of discourse entities in the forward-looking center list within the centering model.\n","found it!\n","printing last sentence\n","We hope this paper will bring about Teitelbaum's wish.\n","printing last sentence\n","Statistical machine translation is a relatively new approach to the long-standing problem of translating human languages by computer.\n","found it!\n","printing last sentence\n","Some works have been proposed to leverage 
these characteristics, e.g., the study of the relationship between the content and bloggers?\n","found it!\n","printing last sentence\n","A Model-Theoretic Coreference Scoring Schem e Marc Vilain, John Burger, John Aberdeen, Dennis Connolly, Lynette Hirschman The MITRE Corporation 202 Burlington Rd .\n","found it!\n","printing last sentence\n","MITRE: DESCRIPTION OF THE ALEMBIC SYSTEM USED FOR MUC-6 John Aberdeen, John Burger, David Day, Lynette Hirschman, Patricia Robinson, and Marc Vilain The MITRE Corporation 202 Burlington Rd .\n","found it!\n","printing last sentence\n","The stability, resistance to overtraining, the existence of probability estimates and, now, reasonable speed make TBL an excellent candidate for solving classification tasks in general.\n","found it!\n","printing last sentence\n","Reference resolution is an important task for discourse or dialogue processing systems since identity relations between anaphoric textual entities and their antecedents is a prerequisite to the understanding of text or conversation.\n","found it!\n","printing last sentence\n","A preliminary version of this paper appears in (Pedersen, 2001).\n","found it!\n","printing last sentence\n","While significant effort has been expended on the parsing of written text, parsing speech has received relatively little attention.\n","found it!\n","printing last sentence\n","The experiments reported in this paper extend prior research in a number of directions.\n","found it!\n","printing last sentence\n","What is the relation between a person’s knowledge of grammar and that same person’s application of that knowledge in perceiving syntactic structure?\n","printing last sentence\n","We also produce a more embellished parse in which phenomena such as predicate-argument structure, subcategorization and movement are given a probabilistic treatment.\n","found it!\n","printing last sentence\n","Many NLP tasks, such as building machine-readable dictionaries, are dependent on the results of 
morphological analysis.\n","found it!\n","printing last sentence\n","In addition, we achieve higher accuracy by applying weighted voting of 8-SVM based systems which are trained using distinct chunk representations.\n","found it!\n","printing last sentence\n","Finally, the paper will empirically evaluate two major questions for each of the tasks: Thus tagging models induced from bilingual alignments can be used to improve these very alignments, and hence improve their own training source.\n","printing last sentence\n","Figure 3 shows an example of a lattice and the slotted lattice derived via the process just described.\n","found it!\n","printing last sentence\n","Unlike most problems addressed with machine learning, parsing natural language sentences requires choosing between an unbounded (or even infinite) number of possible phrase structure trees.\n","found it!\n","printing last sentence\n","PCFG parsing algorithms with worst-case cubic-time bounds are well-known.\n","found it!\n","printing last sentence\n","Various researchers have improved the quality of statistical machine translation system with the use of phrase translation.\n","found it!\n","printing last sentence\n","In this paper, we gave a brief introduction of the manual summary evaluation protocol used in the Document Understanding Conference.\n","found it!\n","printing last sentence\n","translational equivalence between their components.\n","found it!\n","printing last sentence\n","We wish to thank Mihai Surdeanu and Marius Pasca from LCC for their contribution to this work.\n","found it!\n","printing last sentence\n","Some of the automatic evaluations we perform are novel as well.\n","found it!\n","printing last sentence\n","Recent work in statistical text summarization has put forward systems that do not merely extract and concatenate sentences, but learn how to generate new sentences from (Summary, Text) tuples.\n","found it!\n","printing last sentence\n","Taku Kudo provided the output of his SVM 
chunker for the significance test.\n","found it!\n","printing last sentence\n","Improvements in this area will have a significant impact on both semantic and discourse parsing.\n","found it!\n","printing last sentence\n","This is the best automatically learned part-of-speech tagging result known to us, representing an error reduction of 4.4% on the model presented in Collins (2002), using the same data splits, and a larger error reduction of 12.1% from the more similar best previous loglinear model in Toutanova and Manning (2000).\n","found it!\n","printing last sentence\n","The authors thank Dimitra Vergyri, Andreas Stolcke, and Pat Schone for useful discussions during the JHU’02 workshop.\n","printing last sentence\n","Precision And Recall Of Machine Translation\n","found it!\n","printing last sentence\n","Detecting entities, whether named, nominal or pronominal, in unrestricted text is a crucial step toward understanding the text, as it identifies the important conceptual objects in a discourse.\n","found it!\n","printing last sentence\n","We presented some experiments that compare the accuracy and performance of two stochastic parsing systems, the shallow Collins parser and the deep-grammar-based XLE system.\n","found it!\n","printing last sentence\n","Much of natural language work over the past decade has employed probabilistic finite-state transducers (FSTs) operating on strings.\n","found it!\n","printing last sentence\n","The resulting summaries yield 88%match with human-written output, which compares fa vorably to the 69% achieved by the standard ?leading The development and application of computational models of text structure is a central concern in natural language processing.\n","printing last sentence\n","Keller and Lapata (2003) investigated the validity of web counts for a range of predicate-argument bigrams (verbobject, adjective-noun, and noun-noun bigrams).\n","found it!\n","printing last sentence\n","We have started exploring the feasibility of 
automation and we are collecting additional data sets.\n","found it!\n","printing last sentence\n","Despite the enormous progress in machine translation (MT) due to the use of statistical techniques in recent years, state-of-the-art statistical systems often produce translations with obvious errors.\n","found it!\n","printing last sentence\n","Statistical Machine Translation systems have achieved considerable progress in recent years as seen from their performance on international competitions in standard evaluation tasks (NIST, 2003).\n","found it!\n","printing last sentence\n","The noisy-channel model (Brown et al., 1990) has been the foundation for statistical machine translation (SMT) for over ten years.\n","found it!\n","printing last sentence\n","Sections 7 and 8 discuss the evaluation results and give our observations and conclusions.\n","found it!\n","printing last sentence\n","Automatic, accurate and wide-coverage techniques thatcan annotate naturally occurring text with semantic argu ment structure can play a key role in NLP applications such as Information Extraction, Question Answering and Summarization.\n","found it!\n","printing last sentence\n","Then, we will investigate the degree of monotonicity and present the translation results for three tasks: Verbmobil, Xerox and Canadian Hansards.\n","found it!\n","printing last sentence\n","In a very interesting study of syntax in statistical machine translation, Fox (2002) looks at how well proposed translation models fit actual translation data.\n","found it!\n","printing last sentence\n","Current state of the art concept discovery algorithms generate lists of instances of semantic classes but stop short of labeling the classes with concept names.\n","found it!\n","printing last sentence\n","Research paper search engines, such as CiteSeer (Lawrence et al., 1999) and Cora (McCallum et al., 2000), give researchers tremendous power and convenience in their research.\n","found it!\n","printing last 
sentence\n","At a recent meeting, we presented name-tagging technology to a potential user.\n","found it!\n","printing last sentence\n","WordNet::Similarity implements measures of similarity and relatedness that are all in some way based on the structure and content of WordNet.\n","found it!\n","printing last sentence\n","Phrase Translation Model, does not affect the performance of IBM Model 1.\n","found it!\n","printing last sentence\n","The three models are combined in a log-linear way, as shown in the following section.\n","found it!\n","printing last sentence\n","Arabic is garnering attention in the NLP community due to its socio-political importance and its linguistic differences from Indo-European languages.\n","found it!\n","printing last sentence\n","As with many other statistical natural language processing tasks, statistical machine translation (Brown et al., 1993) produces high quality results when ample training data is available.\n","found it!\n","printing last sentence\n","During the last five years there has been a surge in work which aims to provide robust textual inference in arbitrary domains about which the system has no expertise.\n","found it!\n","printing last sentence\n","Note that although we expect that better use of language specific knowledge would improve the results, it would defeat one of the goals of this work.\n","found it!\n","printing last sentence\n","We have described an efficient and fully unsupervised method of producing state-of-the-art word alignments.\n","found it!\n","printing last sentence\n","Less unsupervised methods are more likely to be portable to these new domains, since they do not rely as much on existing annotations.\n","found it!\n","printing last sentence\n","The results are somehow surprising, as one would not expect a community-generated categorization to be almost as informative as a well structured lexical taxonomy such as WordNet.\n","found it!\n","printing last sentence\n","Modeling reorderings between 
languages has been a major challenge for machine translation.\n","found it!\n","printing last sentence\n","The number of the source articles that contained a mention of the hurricane is shown in the right column.\n","found it!\n","printing last sentence\n","Learning, broadly taken, involves choosing a good model from a large space of possible models.\n","found it!\n","printing last sentence\n","Finally, we report on experiments that show the robustness of WASP in Section 6, followed by the conclusion in Section 7.\n","found it!\n","printing last sentence\n","The use of automatic methods for evaluating machine-generated text is quickly becoming mainstream in natural language processing.\n","found it!\n","printing last sentence\n","We plan to study additional variants that these results suggest may be helpful.\n","found it!\n","printing last sentence\n","We describe the OntoNotes methodology and its result, a large multilingual richly-annotated corpus constructed at 90% interannotator agreement.\n","found it!\n","printing last sentence\n","Our approach produces results with accuracy above those of the best individual parsers on both dependency and constituent parsing of the standard WSJ test set.\n","found it!\n","printing last sentence\n","We refer to the training and inference methods described in this section as the Pairwise Model.\n","found it!\n","printing last sentence\n","Finally, Sec tion 6 illustrates these methods in learning Sesotho morphology.\n","found it!\n","printing last sentence\n","Our sentence compression system is freely available for research and educational purposes.\n","found it!\n","printing last sentence\n","Experimental results on Arabic and Chinese to English newswire and newsgroup test data are presented in Section 6.\n","found it!\n","printing last sentence\n","We would like to thank Ray Mooney, Rohit Kate, and the three anonymous reviewers for their comments.\n","found it!\n","printing last sentence\n","Any opinions, findings, and 
conclusions or recommendations expressed above are those of the authors and do not necessarily reflect the views of the NSF.\n","found it!\n","printing last sentence\n","We are also interested in applying our approach to other related areas such as morphology and transliteration.\n","found it!\n","printing last sentence\n","Acknowledgments We would like to thank Eugene Charniak, Mark Johnson and Noah Smith for helpful discussions and comments.\n","found it!\n","printing last sentence\n","This work constitutes a step towards better understanding of the interaction of selectional preferences and inferences, bridging these two aspects of semantics.\n","found it!\n","printing last sentence\n","We have built a fast user interface for querying the results.\n","found it!\n","printing last sentence\n","Measuring semantic similarity and relatedness between terms is an important problem in lexical semantics.\n","found it!\n","printing last sentence\n","The authors thank the anonymous reviewers and Sylvia Rebholz for helpful comments.\n","found it!\n","printing last sentence\n","We present a smoothing technique for unsupervised PCFG estimation which allows us to explore more sophisticated dependency grammars.\n","found it!\n","printing last sentence\n","What linguistic features can improve statistical machine translation (MT)?\n","found it!\n","printing last sentence\n","In this paper, we present a novel precedence reordering approach based on a dependency parser.\n","found it!\n","printing last sentence\n","We thank Erik Sudderth for suggesting sampling the Pitman-Yor hyperparameters and the ACL reviewers for their insightful comments.\n","found it!\n","printing last sentence\n","We presented a discriminatively trained joint model of parsing and named entity recognition, which improved performance on both tasks.\n","found it!\n","printing last sentence\n","Acknowledgements The authors would like to thank Bob Moore, Chris Brockett, Chris Quirk, and Kristina Toutanova for 
their useful discussions as well as the reviewers for their helpful comments.\n","found it!\n","printing last sentence\n","Special thanks to Kemal Oflazar and Reyyan Yeniterzi of Sabancı University for providing the Turkish-English corpus and to Philip Resnik, Adam Lopez, Trevor Cohn, and especially Phil Blunsom for their helpful suggestions.\n","printing last sentence\n","Many of the most glaring errors made by today’s statistical machine translation systems are those resulting from confusion of semantic roles.\n","printing last sentence\n","We presented a resource-light model for vectorspace word meaning that represents words as collections of prototype vectors, naturally accounting for lexical ambiguity.\n","found it!\n","printing last sentence\n","Research on the automatic correction of grammatical errors has undergone a renaissance in the past decade.\n","found it!\n","printing last sentence\n","Ultimately, we hope to put the learned conversation structure to use in the construction of a data-driven, conversational agent.\n","found it!\n","printing last sentence\n","Our work joins others in using Wikipedia revisions to learn interesting types of directional lexical relations, e.g, “eggcorns”3 [7] and entailments [8].\n","printing last sentence\n","Coreference systems exploit a variety of information sources, ranging from syntactic and discourse constraints, which are highly configurational, to semantic constraints, which are highly contingent on lexical meaning and world knowledge.\n","found it!\n","printing last sentence\n","For any statistical machine translation system, the size of the parallel corpus used for training is a major factor in its performance.\n","found it!\n","printing last sentence\n","We hope that further work on this non-directional parsing framework will pave the way to better understanding of an interesting cognitive question: which kinds of parsing decisions are hard to make, and which linguistic constructs are hard to analyze?\n","found 
it!\n","printing last sentence\n","In this paper we examined the viability of sentiment lexicons learned semi-automatically from the web, as opposed to those that rely on manual annotation and/or resources such as WordNet.\n","found it!\n","printing last sentence\n","We have presented three new, large-margin tuning methods for SMT that can handle thousands of features.\n","found it!\n","printing last sentence\n","In the parsing experiments, we use the following data sets.\n","found it!\n","printing last sentence\n","The remainder of this paper is organized as follows: Section 2 describes the proposed method; Section 3 presents experimental results; Section 4 discusses some details of grammar correction evaluation; and Section 5 concludes the paper.\n","found it!\n","printing last sentence\n","We release all of these resources to the research community: asked for your last name so he can add you on Facebook.\n","printing last sentence\n","We have presented a generally applicable vector offset method for identifying linguistic regularities in continuous space word representations.\n","found it!\n","printing last sentence\n","We also report on a preliminary effort towards constructing event chronologies from this data.\n","found it!\n","printing last sentence\n","One of the primary problems that NLP researchers who work in new languages or new domains encounter is a lack of available annotated data.\n","found it!\n","printing last sentence\n","This paper has presented an original algorithm capable of inducing the accurate morphological analysis of even highly irregular verbs, starting with no paired examples for training and no prior seeding of legal morphological transformations.\n","printing last sentence\n","This more powerful model gives significant improvements in accuracy over previous approaches to noisy channel spelling correction.\n","found it!\n","printing last sentence\n","This paper reviews the framework, discusses some of the pros and cons of this 
approach using examples from our corpus of news wire stories, and presents an initial evaluation.\n","found it!\n","printing last sentence\n","In statistical machine translation we set up a statistical translation model Pr(fillef) which describes the relationship between a source language (SL) string f and a target language (TL) string ef.\n","found it!\n","printing last sentence\n","Why use tree-adjoining grammar for statistical parsing?\n","found it!\n","printing last sentence\n","In the rest of this paper we report on our current system, as well as a number of preliminary experiments on extensions to the system.\n","found it!\n","printing last sentence\n","Q-26 What is the name of the "female" counterpart to El Nino, which results in cooling temperatures and very dry weather ?\n","printing last sentence\n","Machine learning techniques, which automatically learn linguistic information from online text corpora, have been applied to a number of natural language problems throughout the last decade.\n","found it!\n","printing last sentence\n","We thank Noemie Elhadad, Mike Collins, Michael Elhadad and Maria Lapata for useful discussions.\n","found it!\n","printing last sentence\n","All of the most accurate statistical parsers [1,3, 6,7,12,14] are lexicalized in that they condition probabilities on the lexical content of the sentences being parsed.\n","found it!\n","printing last sentence\n","We have increased the generator’s reliability by making the ERG monotonic and we expect further improvements in practical performance once we take full advantage of the restrictions in the grammar to cut down the search space.\n","printing last sentence\n","In computational linguistics, a variety of (statistical) measures have been proposed for identifying lexical associations between words in lexical tuples extracted from text corpora.\n","found it!\n","printing last sentence\n","The first is a fast greedy decoder, and the second is a slow optimal decoder based on generic 
mathematical programming techniques.\n","found it!\n","printing last sentence\n","Documents usually include various topics.\n","found it!\n","printing last sentence\n","We conclude with Section 4, followed by an Appendix describing the training algorithm in more detail.\n","found it!\n","printing last sentence\n","Rational relations on strings have become widespread in language and speech engineering (Roche and Schabes, 1997).\n","found it!\n","printing last sentence\n","The web results easily outperform the TREC results.\n","found it!\n","printing last sentence\n","Noun phrase coreference resolution refers to the problem of determining which noun phrases (NPs) refer to each real-world entity mentioned in a document.\n","found it!\n","printing last sentence\n","We have presented a simple generative model for the unsupervised distributional induction of hierarchical linguistic structure.\n","found it!\n","printing last sentence\n","Possibly there is a way to use both skeletal and the original kind of patterns in a single system.\n","found it!\n","printing last sentence\n","We have presented a method for using word pronunciation information to improve spelling correction accuracy.\n","found it!\n","printing last sentence\n","In this paper we have described an infrastructure for language engineering software which aims to assist the develeopment of robust tools and resources for NLP.\n","found it!\n","printing last sentence\n","Over the past decade, most work in the field of information extraction has shifted from complex rule-based, systems designed to handle a wide variety of semantic phenomena including quantification, anaphora, aspect and modality (e.g.\n","found it!\n","printing last sentence\n","The authors would like to thank the anonymous reviewers for their comments, Rebecca Hwa and Okan Kolak for helpful assistance and discussion, Franz Josef Och for his help with GIZA++, Adwait Ratnaparkhi for the use of MXTERMINATOR, and our collaborators at Johns Hopkins 
for the use of their computing facilities in parts of this work.\n","found it!\n","printing last sentence\n","The candi dates might be enumerated by a number of methods.The experiments in this paper use the top The perceptron algorithm is one of the oldest algorithms in machine learning, going back to (Rosen blatt 1958).\n","printing last sentence\n","Statistical parsing using combined systems of handcoded linguistically fine-grained grammars and stochastic disambiguation components has seen considerable progress in recent years.\n","found it!\n","printing last sentence\n","In statistical alignment models Pr(fJ1 , aJ1 |eI1), the alignment aJ1 is introduced as a hidden variable: We have presented a framework for statistical MT for natural languages, which is more general than the widely used source-channel approach.\n","printing last sentence\n","Section 7 discusses LM issues, and is followed by conclusions.\n","found it!\n","printing last sentence\n","In Section 5, we compare our baseline metric performance with human evaluations.\n","found it!\n","printing last sentence\n","This will be important for applying the parser to tasks such as language modelling, for which the possibility of incremental processing of CCG appears particularly attractive.\n","found it!\n","printing last sentence\n","In order to estimate the conditional probabilitiesof our model, we recursively smooth empirical es timates e?i of specific conditional distributions with(possible smoothed) estimates of less specific distri butions e?i State-of-the-art statistical parsers use many other features, or conditioning variables, such as head words, subcategorization frames, distance measures and grandparent nodes.\n","printing last sentence\n","I suggest that the Yarowsky algorithm is actually based on a different independence assumption, and I show that, if the independence assumption holds, the Yarowsky algorithm is effective at finding a high-precision classifier.\n","found it!\n","printing last 
sentence\n","We study empirically the adequacy of various features for the task of discourse relation classification and we show that some discourse relations can be correctly recognized with accuracies as high as 93%.\n","found it!\n","printing last sentence\n","This work has been supported, in part, by ONR MUM Contract FCP0.810548265, NSA RD02-5700, DARPA/ITO Cooperative Agreement N660010028910, and Mitre Contract 0104187712.\n","found it!\n","printing last sentence\n","We also compare our system with human translators and a commercial system.\n","found it!\n","printing last sentence\n","The latter difficulty might be addressed by using semantic orientation combined with other features in a supervised classification algorithm.\n","found it!\n","printing last sentence\n","Section 6 contains our remarks and possible extensions of the proposed work.\n","found it!\n","printing last sentence\n","It should be a viable alternative to methods such as the boosting or Markov Random Field algorithms described in previous work.\n","found it!\n","printing last sentence\n","The authors would like to thank Miruna Ticrea for her valuable help with training the classifier.\n","found it!\n","printing last sentence\n","Section 5 summarizes the conclusions.\n","found it!\n","printing last sentence\n","In this paper, we proposed a noisy-channel model for QA that can accommodate within a unified framework the exploitation of a large number of resources and QA-specific techniques.\n","found it!\n","printing last sentence\n","Kernel methods (e.g., Support Vector Machines (Vapnik, 1995)) attract a great deal of attention recently.\n","found it!\n","printing last sentence\n","This paper has presented a novel approach to automatic semantic classification of verbs.\n","found it!\n","printing last sentence\n","Finally, we show that our aligned corpus has attracted people both inside and outside the NLP community.\n","found it!\n","printing last sentence\n","Our loosely tree-based alignment 
techniques allow statistical models of machine translation to make use of syntactic information while retaining the flexibility to handle cases of non-isomorphic source and target trees.\n","found it!\n","printing last sentence\n","Our experiments show that this model can be an effective tool for improving an existing word alignment.\n","found it!\n","printing last sentence\n","Testing our sister-head model on these languages is a topic for future research.\n","found it!\n","printing last sentence\n","We have described the ITG constraints in detail and compared them to the IBM constraints.\n","found it!\n","printing last sentence\n","In this paper, we investigate methods to efficiently optimize model parameters with respect to machine translation quality as measured by automatic evaluation criteria such as word error rate and BLEU.\n","found it!\n","printing last sentence\n","To our knowledge the work presented here describes the first implemented system for corpus-based anaphora resolution dealing also with non-NP-antecedents.\n","found it!\n","printing last sentence\n","In this paper we have proposed a competition learning approach to coreference resolution.\n","found it!\n","printing last sentence\n","In this paper, we explored alternative models for the automatic acquisition of extraction patterns.\n","found it!\n","printing last sentence\n","Given the comparison results, we can say with confidence that our system achieves at least the performance of state-of-the-art word segmentation systems.\n","found it!\n","printing last sentence\n","The work described in this paper is motivated by research into automatic pattern acquisition.\n","found it!\n","printing last sentence\n","Our work adopts major components of the algorithm from (Luo & Roukos 1996): language model (LM) parameter estimation from a segmented corpus and input segmentation on the basis of LM probabilities.\n","printing last sentence\n","The advantages of unlexicalized grammars are clear enough – 
easy to estimate, easy to parse with, and time- and space-efficient.\n","printing last sentence\n","The trends we obtained are different enough from previous work to merit discussion.\n","found it!\n","printing last sentence\n","The task of word sense disambiguation (WSD) is to determine the correct meaning, or sense of a word in context.\n","found it!\n","printing last sentence\n","The author was supported by EPSRC grant number R40036.\n","found it!\n","printing last sentence\n","We close with discussions and conclusions.\n","found it!\n","printing last sentence\n","In this paper, we introduce a method of detecting learners’ errors, and we examine to what extent this could be accomplished using our learner corpus data including error tags that are labeled with the learners’ errors.\n","printing last sentence\n","We are now prepared to discuss the synchronous case.\n","found it!\n","printing last sentence\n","Since the two approaches seem to have different strengths, a combined model may outperform both of them.\n","found it!\n","printing last sentence\n","Much recent work has investigated the application of discriminative methods to NLP tasks, with mixed results.\n","found it!\n","printing last sentence\n","Finally, the oracle results suggest that further experimentation with the supertagger will significantly improve parsing accuracy, efficiency and robustness.\n","found it!\n","printing last sentence\n","Overall, these results show much promise in the use of discriminative learning techniques such as the perceptron algorithm to help perform heuristic search in difficult domains such as statistical parsing.\n","found it!\n","printing last sentence\n","We compare our approach with some recent work in Section 6.\n","found it!\n","printing last sentence\n","We expect to see the proposed model to be further explored in other related areas.\n","found it!\n","printing last sentence\n","The computational treatment of opinion, sentiment, and subjectivity has recently 
attracted a great deal of attention (see references), in part because of its potential applications.\n","found it!\n","printing last sentence\n","We experiment with several WordNet Similarity measures (Patwardhan and Pedersen, 2003) which aim to capture semantic relatedness within The first sense heuristic which is often used as a baseline for supervised WSD systems outperforms many of these systems which take surrounding context into account.\n","printing last sentence\n","The determination of syntactic structure is an important step in natural language processing as syntactic structure strongly determines semantic interpretation in the form of predicate-argument struc ture, dependency relations or logical form.\n","found it!\n","printing last sentence\n","Other studies may relate to the use of SCF to generate verb clusters.\n","found it!\n","printing last sentence\n","Finally, we conclude with future work.\n","found it!\n","printing last sentence\n","It is worthwhile to characterize relation types that are better captured by the sparse kernel, and to determine when using the sparse kernel is worth the increased computational burden.\n","found it!\n","printing last sentence\n","We have presented an approach to collective information extraction that uses Relational Markov Networks to reason about the mutual influences between multiple extractions.\n","found it!\n","printing last sentence\n","It demonstrates that the broad constituent and dependency structure of a language can be recovered quite successfully (individually or, more effectively, jointly) from a very modest amount of training data.\n","found it!\n","printing last sentence\n","We have demonstrated that it is possible to improve the performance of Model 1 in terms of alignment error by about 30%, simply by changing the way its parameters are estimated.\n","found it!\n","printing last sentence\n","Another interesting work is to study when to stop active learning.\n","found it!\n","printing last 
sentence\n","Using objective functions to automatically evaluate machine translation quality is not new.\n","found it!\n","printing last sentence\n","A parser is an algorithm for inferring the structure of its input, guided by a grammar that dictates what structures are possible or probable.\n","found it!\n","printing last sentence\n","One of the main features of meetings is the occurrence of agreement and disagreement among participants.\n","found it!\n","printing last sentence\n","We obtained our best results when we combined a variety of features.\n","found it!\n","printing last sentence\n","In supervised learning applications, one can often find a large amount of unlabeled data without difficulty, while labeled data are costly to obtain.\n","found it!\n","printing last sentence\n","We conducted four sets of experiments.\n","found it!\n","printing last sentence\n","Experiments of the parsing of realworld sentences can properly evaluate the effectiveness and possibility of parsing models for HPSG.\n","found it!\n","printing last sentence\n","This work was supported by NSF ITR grants 0205456, 0205448, and 0428193.\n","found it!\n","printing last sentence\n","In section 5, we then evaluate the entire parsing system by training and evaluating on data from the Prague Dependency Treebank.\n","found it!\n","printing last sentence\n","There has recently been a dramatic surge of interest in sentiment analysis, as more and more people become aware of the scientific challenges posed and the scope of new applications enabled by the processing of subjective language.\n","found it!\n","printing last sentence\n","We empirically show that the proposed method works well even with a small number of seed words.\n","found it!\n","printing last sentence\n","A key requirement for any system that produces text is the coherence of its output.\n","found it!\n","printing last sentence\n","In our experiments, our approach compares favorably to two state-of-the-art coreference systems 
adopting the standard machine learning approach, outperforming them by as much as 4–7% on the three data sets for one of the performance metrics.\n","printing last sentence\n","Finally thanks to the National Science Foundation for its support (NSF IIS-0112432, NSF 9721276, and NSF DMS-0074276).\n","found it!\n","printing last sentence\n","The alignment template translation model (Och and Ney, 2004) and related phrase-based models advanced the previous state of the art by moving from words to phrases as the basic unit of translation.\n","found it!\n","printing last sentence\n","Following up on ideas introduced by (Cherry & Lin, 03) we plan to explore ways to leverage the dependency tree to improve alignment quality.\n","printing last sentence\n","We have created a supervised version of the noisychannel model with some improvements over the K&M model.\n","printing last sentence\n","We discuss future work (§6) and conclude (§7).\n","printing last sentence\n","Statistical context free grammars provide another example of statistical models which are restricted to limiting local structure, and which could benefit from modeling nonlocal structure.\n","found it!\n","printing last sentence\n","Results on each of these evaluation regimes are then presented (Sections 6 and 7).\n","found it!\n","printing last sentence\n","Information extraction subsumes a broad range of tasks, including the extraction of entities, relations and events from various text sources, such as newswire documents and broadcast transcripts.\n","found it!\n","printing last sentence\n","With the dramatic increase in the amount of textual information available in digital archives and the WWW, there has been growing interest in techniques for automatically extracting information from text.\n","found it!\n","printing last sentence\n","We will follow with our experimental results and conclusion and close with a discussion of possible future directions.\n","found it!\n","printing last sentence\n","We presented 
the formal description of a Stochastic Lexicalized Inversion Transduction Grammar with its EM training procedure, and proposed specially designed pruning and smoothing techniques.\n","found it!\n","printing last sentence\n","Section 6 provides a summary and description of future work.\n","found it!\n","printing last sentence\n","While our experiments are on German, other languages have word orders that are very different from English, so we believe our methods will be generally applicable.\n","found it!\n","printing last sentence\n","Future work includes a full-fledged version of SDIG and a more sophisticated MT pipeline with possibly a tri-gram language model for decoding.\n","found it!\n","printing last sentence\n","Arabic is a morphologically complex language.1 The morphological analysis of a word consists of determining the values of a large number of (orthogonal) features, such as basic part-of-speech (i.e., noun, verb, and so on), voice, gender, number, information about the clitics, and so on.2 For Arabic, this gives us about 333,000 theoretically possible completely specified morphological analyses, i.e., morphological tags, of which about 2,200 are actually used in the first 280,000 words of the Penn Arabic Treebank (ATB).\n","found it!\n","printing last sentence\n","A simple combination of these representations did lead to improved performance.\n","found it!\n","printing last sentence\n","This is especially true when we model the dependencies with discriminative models capable of incorporating long-distance features.\n","found it!\n","printing last sentence\n","Paraphrases are alternative ways of conveying the same information.\n","found it!\n","printing last sentence\n","In the last decade, the field of Natural Language Processing (NLP), has seen a surge in the use of corpus motivated techniques.\n","found it!\n","printing last sentence\n","Other extensions of this work are to collect more text marked-up with emoticons, and to experiment with techniques 
to automatically remove noisy examples from the training data.\n","found it!\n","printing last sentence\n","We wish to thank Robert Frederking, Ralf Brown and Jaime Carbonell for their valuable input and suggestions.\n","found it!\n","printing last sentence\n","We will explore how the interaction between the generation and segmentation components can improve the performance of such a system as a whole.\n","found it!\n","printing last sentence\n","Pronoun resolution is a difficult but vital part of the overall coreference resolution task.\n","found it!\n","printing last sentence\n","Finally, we conclude in Section 7.\n","found it!\n","printing last sentence\n","We also thank three anonymous reviewers for ACL06.\n","found it!\n","printing last sentence\n","Giving special attention to such cases should help get rid of these errors, and improve the precision of the method.\n","found it!\n","printing last sentence\n","Finally, the method presented here could be useful for lexicographers in the comparison of the quality of dictionaries, and in the detection of missing word senses.\n","found it!\n","printing last sentence\n","It remains to be seen whether one could enrich existing ontologies with relations harvested by Espresso, and it is our hope that these relations will benefit NLP applications.\n","found it!\n","printing last sentence\n","In particular, we expect to be looking into alternative word alignment models and possibly enhancing our system’s decoder using some of the richer, more structured language models that are beginning to emerge.\n","printing last sentence\n","We are planning an evaluation according to this measure after improving the merge stage.\n","found it!\n","printing last sentence\n","Modern statistical parsers require treebanks to train their parameters, but their performance declines when one parses genres more distant from the training data’s domain.\n","printing last sentence\n","By using a split-and-merge strategy and beginning with the 
barest possible initial structure, our method reliably learns a PCFG that is remarkably good at parsing.\n","found it!\n","printing last sentence\n","In this paper we presented a MaxEnt-based phrase reordering model for SMT.\n","found it!\n","printing last sentence\n","Then, we conclude this paper with a discussion in Section 6.\n","found it!\n","printing last sentence\n","In §6 we briefly review contrastive estimation (Smith and Eisner, 2005a), relating it to the new method, and show its performance alone and when augmented with structural bias.\n","printing last sentence\n","As a result, the task of our decoder is to find the best target string while Galley’s is to seek the most likely target tree.\n","printing last sentence\n","Our main result is that best performance is obtained when learning segmentation and morpheme tagging in one step, which is made possible by an appropriate text representation.\n","found it!\n","printing last sentence\n","Computational Linguistics, 27(3):351–372.\n","printing last sentence\n","Finally, some discussion and future work is presented in Section 5.\n","found it!\n","printing last sentence\n","The growing interest in practical NLP applications such as question-answering and text summarization places increasing demands on the processing of temporal information.\n","found it!\n","printing last sentence\n","We presented a semi-supervised algorithm based on IBM Model 4, with modeling and search extensions, which produces alignments of improved F-measure over unsupervised Model 4 training.\n","found it!\n","printing last sentence\n","We have presented an algorithm for inducing semantic taxonomies which attempts to globally optimize the entire structure of the taxonomy.\n","found it!\n","printing last sentence\n","Named Entity recognition has been getting much attention in NLP research in recent years, since it is seen as significant component of higher level NLP tasks such as information distillation and question answering.\n","found 
it!\n","printing last sentence\n","We conclude our work and indicate the future work in Section 6.\n","found it!\n","printing last sentence\n","The problem of bootstrapping syntactic structure from unlabeled data has regained considerable interest.\n","found it!\n","printing last sentence\n","Finally, Section 5 provides a discussion of our findings, and Section 6 summarizes our conclusions.\n","found it!\n","printing last sentence\n","We presented a new kernel-based approach to learn semantic parsers.\n","found it!\n","printing last sentence\n","Finally, we show that our contextually richer rules provide a 3.63 BLEU point increase over those of (Galley et al, 2004).\n","found it!\n","printing last sentence\n","Allowing a single gap in bilingual phrases or other types of constituent can improve coverage dramatically.\n","found it!\n","printing last sentence\n","Probabilistic language models are used extensively in a variety of linguistic applications, including speech recognition, handwriting recognition, optical character recognition, and machine translation.\n","found it!\n","printing last sentence\n","This work was partially supported by ARDA AQUAINT and by the NSF (award IIS-0208798).\n","found it!\n","printing last sentence\n","Section 6 concludes the paper.\n","found it!\n","printing last sentence\n","§5 discusses these results and proposes further lines of research.\n","printing last sentence\n","We have presented a discriminative, syntactic word alignment method.\n","found it!\n","printing last sentence\n","The work of Joakim Nivre is partially supported by the Swedish Research Council.\n","found it!\n","printing last sentence\n","In this paper, we proposed “On-demand Information Extraction (ODIE)”.\n","printing last sentence\n","Finally, we note the connections of minimum risk training to max-margin training and minimum Bayes risk decoding (§7), and recapitulate our results (§8).\n","printing last sentence\n","Assigning syntactic categories to words is an 
important pre-processing step for most NLP applications.\n","found it!\n","printing last sentence\n","Finally, the training and tuning of the parse ranking model has been made more flexible.\n","found it!\n","printing last sentence\n","In light of the need to reconcile word alignments with phrase structure trees for syntactic MT, we have proposed an HMM-like model whose distortion is sensitive to such trees.\n","found it!\n","printing last sentence\n","It is not intuitively clear why the SMT system can learn something from its own output and is improved through semi-supervised learning.\n","found it!\n","printing last sentence\n","We have shown that WSD improves the translation performance of a state-of-the-art hierarchical phrase-based statistical MT system and this improvement is statistically significant.\n","found it!\n","printing last sentence\n","In natural language, a word often assumes different meanings, and the task of determining the correct meaning, or sense, of a word in different contexts is known as word sense disambiguation (WSD).\n","found it!\n","printing last sentence\n","Thus we envision forest rescoring as being of general applicability for reducing complicated search spaces, as an alternative to simulated annealing methods (Kirkpatrick et al., 1983).\n","found it!\n","printing last sentence\n","Acknowledgements Many thanks to Jason Baldridge, Razvan Bunescu, Stefan Evert, Ray Mooney, Ulrike and Sebastian Pad6, and Sabine Schulte im Walde for helpful discussions.\n","found it!\n","printing last sentence\n","In this paper we have addressed a novel type of problem: given a specific concept, discover in fully unsupervised fashion, a range of relations in which it participates.\n","found it!\n","printing last sentence\n","On the other hand, the precision on NML and JJP constituents was quite high, so the parser is able to identify at least some of the structure very well.\n","found it!\n","printing last sentence\n","Hence, we conclude that 
accurate, large-scale, linguistically-motivated NLP is now practical with CCG.\n","found it!\n","printing last sentence\n","Finally, we compare our framework with related work in Section 5 before we conclude in Section 6.\n","found it!\n","printing last sentence\n","Natural Language Processing (NLP) systems typically require large amounts of knowledge to achieve good performance.\n","found it!\n","printing last sentence\n","SMT practitioners have on the whole found it difficult to integrate syntax into their systems.\n","found it!\n","printing last sentence\n","System combination has been shown to improve classification performance in various tasks.\n","found it!\n","printing last sentence\n","Grammar induction, the learning of the grammar of a language from unannotated example sentences, has long been of interest to linguists because of its relevance to language acquisition by children.\n","found it!\n","printing last sentence\n","Furthermore, extensions to the sentence-document model were discussed and it was argued that a nested hierarchical structure would be beneficial since it would allow for efficient inference algorithms.\n","found it!\n","printing last sentence\n","We are also actively searching for a larger and more varied set of domains on which to test our techniques.\n","found it!\n","printing last sentence\n","We presented two techniques for query expansion in answer retrieval that are based on SMT technology.\n","found it!\n","printing last sentence\n","The framework presented here shows that with some consideration for its workings, the randomised nature of the Bloom filter need not be a significant impediment to is use in applications.\n","found it!\n","printing last sentence\n","We would like to thank the anonymous reviewers for their helpful suggestions.\n","found it!\n","printing last sentence\n","This paper proposes a novel, probabilistic approach to reordering which combines the merits of syntax and phrase-based SMT.\n","found it!\n","printing 
last sentence\n","In this paper we have presented a novel method for obtaining more reliable translation estimates from small datasets.\n","found it!\n","printing last sentence\n","We hope that our success with POS tagging will inspire further research into Bayesian methods for other natural language learning tasks.\n","found it!\n","printing last sentence\n","Many NLP tasks can be modeled as a sequence classification problem, such as POS tagging, chunking, and incremental parsing.\n","found it!\n","printing last sentence\n","Our results show that PAS and syntactic parsing are promising methods to address tasks affected by data sparseness like question/answer categorization.\n","found it!\n","printing last sentence\n","We proposed a word-based CWS model using the discriminative perceptron learning algorithm.\n","found it!\n","printing last sentence\n","Referring to an entity in natural language can broadly be decomposed into two processes.\n","found it!\n","printing last sentence\n","We have presented A-WASP, a semantic parsing algorithm based on a A-SCFG that generates logical forms using A-calculus.\n","found it!\n","printing last sentence\n","A similar method can therefore be used to derive tools for subjectivity analysis in other languages.\n","found it!\n","printing last sentence\n","The automatic processing of scientific papers using NLP and machine learning (ML) techniques is an increasingly important aspect of technical informatics.\n","found it!\n","printing last sentence\n","This paper has presented a suite of open-source tools which we believe will be of value to the MT research community.\n","found it!\n","printing last sentence\n","Section 6 considers related work, which is then followed by a discussion of future work.\n","found it!\n","printing last sentence\n","The last author was supported by NSF IIS-0546554.\n","found it!\n","printing last sentence\n","We would also like to thank Chris Quirk for inspirations, Yang Liu for help with rule extraction, 
Mark Johnson for posing the question of virtual ∞-best list, and the anonymous reviewers for suggestions.\n","printing last sentence\n","Statistical machine translation (SMT) has seen a resurgence in popularity in recent years, with progress being driven by a move to phrase-based and syntax-inspired approaches.\n","found it!\n","printing last sentence\n","In this paper we presented a general framework for vector-based semantic composition.\n","found it!\n","printing last sentence\n","Identifying events of a particular type within individual documents – ‘classical’ information extraction – remains a difficult task.\n","printing last sentence\n","We conclude in Section 4 with an examination of related work.\n","found it!\n","printing last sentence\n","The work of the second author as well as collaboration visits to Israel was financed by NWO, grant number 017.001.271.\n","found it!\n","printing last sentence\n","Finally, we conclude our work in Section 7.\n","found it!\n","printing last sentence\n","The well-formedness of the dependency structures enables efficient decoding through dynamic programming.\n","found it!\n","printing last sentence\n","We believe this general framework could also be applied to other problems involving forests or lattices, such as sequence labeling and machine translation.\n","found it!\n","printing last sentence\n","The authors thank the anonymous reviewers for their insightful comments.\n","found it!\n","printing last sentence\n","Our results may encourage the adoption of the SSL method for many other real world applications.\n","found it!\n","printing last sentence\n","For centuries, the deep connection between human languages has fascinated linguists, anthropologists and historians (Eco, 1995).\n","found it!\n","printing last sentence\n","We have demonstrated that unsupervised POS tagging can reach good results using the robust EMHMM learner when provided with good initial conditions, even with incomplete dictionaries.\n","found 
it!\n","printing last sentence\n","In this paper, we have introduced an efficient, distributed clustering algorithm for obtaining word classifications for predictive class-based language models with which we were able to use billions of tokens of training data to obtain classifications for millions of words in relatively short amounts of time.\n","found it!\n","printing last sentence\n","We have presented a generative model for bilingual lexicon induction based on probabilistic CCA.\n","found it!\n","printing last sentence\n","We have shown that it is possible to learn narrative event chains unsupervised from raw text.\n","found it!\n","printing last sentence\n","We proposed a joint Chinese word segmentation and POS tagging model, which achieved a considerable reduction in error rate compared to a baseline twostage system.\n","found it!\n","printing last sentence\n","This work was done while L.\n","found it!\n","printing last sentence\n","In this paper, we have demonstrated how the two dominant approaches to data-driven dependency parsing, graph-based models and transition-based models, can be integrated by letting one model learn from features generated by the other.\n","found it!\n","printing last sentence\n","This speed-up does not come with a performance cost; we attain an F-score of 90.9%, a 14% relative reduction in errors over previous work on WSJ15.\n","found it!\n","printing last sentence\n","Finally we conclude in Section 7 with a summary and potential directions for future work.\n","found it!\n","printing last sentence\n","This research was supported by the GALE program of the Defense Advanced Research Projects Agency, Contract No.\n","found it!\n","printing last sentence\n","We conducted experiments with four semantic classes, achieving high accuracies and outperforming the results reported by others who have worked on the same classes.\n","found it!\n","printing last sentence\n","Although O is NP-hard, we present an approach to solving it using integer 
linear programming (ILP).\n","found it!\n","printing last sentence\n","Thanks to the following members of the Stanford NLP reading group for helpful discussion: Sharon Goldwater, Michel Galley, Anna Rafferty.\n","found it!\n","printing last sentence\n","We would like to thank the BLLIP team for their comments.\n","found it!\n","printing last sentence\n","These results indicate the power of learning from this new form of automated supervision.\n","found it!\n","printing last sentence\n","Nonetheless, taking the first unsupervised approach to this problem, we were able to make substantial progress: We achieve an F1 of 53.2%, which closes over half of the gap between a heuristic baseline (26%) and supervised systems (68%–80%).\n","printing last sentence\n","In contrast, our MBR algorithm directly selects the hypothesis in the hypergraph with the maximum expected approximate corpus BLEU score (Tromble et al., 2008).\n","found it!\n","printing last sentence\n","We would also like to thank Vladislav D.\n","found it!\n","printing last sentence\n","Sentiment classification is the task of identifying the sentiment polarity of a given text.\n","found it!\n","printing last sentence\n","Xing was supported by NSF DBI0546594, DBI-0640543, IIS-0713379, and an Alfred Sloan Foundation Fellowship in Computer Science.\n","found it!\n","printing last sentence\n","Syntactic parsing using dependency structures has become a standard technique in natural language processing with many different parsing models, in particular data-driven models that can be trained on syntactically annotated corpora (Yamada and Matsumoto, 2003; Nivre et al., 2004; McDonald et al., 2005a; Attardi, 2006; Titov and Henderson, 2007).\n","found it!\n","printing last sentence\n","For English and a handful of other languages, there are large, well-annotated corpora with a variety of linguistic information ranging from named entity to discourse structure.\n","found it!\n","printing last sentence\n","For the rest of 
this paper, we will limit ourselves to a 2-gram tag model.\n","found it!\n","printing last sentence\n","In Chinese, word segmentation and part-of-speech (POS) tagging are indispensable steps for higherlevel NLP tasks.\n","found it!\n","printing last sentence\n","This work is funded in part by NSF (IIS-0811974).\n","found it!\n","printing last sentence\n","We examine the state-of-the-art in NP coreference resolution.\n","found it!\n","printing last sentence\n","We would like to thank Sasha Blair-Goldensohn for providing us with the TextRels data and for the insightful discussion in the early stages of our work.\n","found it!\n","printing last sentence\n","We have presented a Bayesian model of SCFG induction capable of capturing phrasal units of translational equivalence.\n","found it!\n","printing last sentence\n","This paper proposes a method for statistical paraphrase generation.\n","found it!\n","printing last sentence\n","Inversion transduction grammar (ITG) constraints (Wu, 1997) provide coherent structural constraints on the relationship between a sentence and its translation.\n","found it!\n","printing last sentence\n","Our research was partially funded by the NSF via award IIS0811974 and by Robert Bosch LLC.\n","found it!\n","printing last sentence\n","Our query classifier reaches the same level of performance as the KDDCUP 2005 winning systems, which were built with a great deal of knowledge engineering.\n","found it!\n","printing last sentence\n","These results taken We have shown that using a few syntactic features leads to state-of-the-art accuracy for discourse vs.\n","printing last sentence\n","Acknowledgments This work was supported by NSF grants IIS-0546554 and ITR-0428020.\n","found it!\n","printing last sentence\n","In summary, we make three main contributions: The remainder of this paper is divided as follows: Sections 2 and 3 give background, Sections 4 and 5 describe our new parsing algorithms, Section 6 discusses related work, Section 7 
presents our experimental results, and Section 8 concludes.\n","found it!\n","printing last sentence\n","We retract former negative results published in Turian et al.\n","found it!\n","printing last sentence\n","Selectional Preferences encode the set of admissible argument values for a relation.\n","found it!\n","printing last sentence\n","The main conclusions of this study are drawn in Section 6.\n","found it!\n","printing last sentence\n","Statements and opinions expressed do not necessarily reflect the position or the policy of the United States Government, and no official endorsement should be inferred.\n","found it!\n","printing last sentence\n","We thank the three anonymous reviewers for their invaluable comments on an earlier draft of the paper.\n","found it!\n","printing last sentence\n","Though exact tree-to-tree translation tends to hamper translation quality by imposing too many constraints during both grammar extraction and decoding, we have shown that using both source and target syntax improves translation accuracy when the model is given the opportunity to learn from data how strongly to apply syntactic constraints.\n","found it!\n","printing last sentence\n","The cross-entropy difference selection method introduced here seems to produce language models that are both a better match to texts in a restricted domain, and require less data for training, than any of the other data selection methods tested.\n","found it!\n","printing last sentence\n","We show experimentally that cdec uses less memory and time than comparable decoders on a controlled translation task (§7).\n","printing last sentence\n","We would like to thank Matt Callcut for refining the language of this paper, and thank Yuki Arase and the anonymous reviewers for many valuable comments and helpful suggestions.\n","found it!\n","printing last sentence\n","We show experimentally that discriminative models with appropriate feature types can achieve performance close to the upper bound, as 
defined by the agreement between human examiners on the same test corpus.\n","found it!\n","printing last sentence\n","We thank Chris Brockett, Raymond Mooney, Katrin Erk, Jason Baldridge and the anonymous reviewers for helpful comments on a previous draft.\n","found it!\n","printing last sentence\n","In this paper, we have proposed the task of lexical normalisation for short text messages, as found in Twitter and SMS data.\n","found it!\n","printing last sentence\n","Information-extraction (IE), the process of generating relational data from natural-language text, continues to gain attention.\n","found it!\n","printing last sentence\n","Our system outperforms all existing systems despite using no annotated logical forms.\n","found it!\n","printing last sentence\n","Our results outperform strong unsupervised baselines as well as approaches that rely on direct projections, and bridge the gap between purely supervised and unsupervised POS tagging models.\n","found it!\n","printing last sentence\n","A template defines a specific type of event (e.g., a bombing) with a set of semantic roles (or slots) for the typical entities involved in such an event (e.g., perpetrator, target, instrument).\n","found it!\n","printing last sentence\n","We note that in the BOT evaluation, following (Milne and Witten, 2008b) we consider all the titles within a document, even if some the titles were due to mentions we failed to identify.5 We evaluate GLOW on four data sets, of which two are from previous work.\n","printing last sentence\n","We also believe that the annotated data can be useful for research into domain adaptation and semi-supervised learning.\n","found it!\n","printing last sentence\n","The need for statistical hypothesis testing for machine translation (MT) has been acknowledged since at least Och (2003).\n","found it!\n","printing last sentence\n","Our experiments were performed using the Penn Treebank (PTB) and Chinese Treebank (CTB) data.\n","found it!\n","printing last 
sentence\n","We show that our multi-prototype model improves upon thesingle-prototype version and outperforms other neu ral language models and baselines on this dataset.\n","found it!\n","printing last sentence\n","Syntactic parsing is a central task in natural language processing because of its importance in mediating between linguistic expression and meaning.\n","found it!\n","printing last sentence\n","The main purpose of the paper was to sort out the confusion about the roles of syntactic, semantic, and pragmatic factors in the interpretation and generation of definite noun phrases in discourse.\n","found it!\n","printing last sentence\n","Bell Laboratories Murray Hill, New Jersey 07974 It is often remarked that natural language, used naturally, is unnaturally ungrammatical.\n","found it!\n","printing last sentence\n","Bell Laboratories Murray Hill, New Jersey 07974 Linguists, including computational linguists, have always been fond of talking about trees.\n","found it!\n","printing last sentence\n","Finally, Section 7 discusses questions of computational complexity and decidability.\n","found it!\n","printing last sentence\n","A complex value is a collection of features, for example: Most schools of linguistics use some type of feature notation in their phonological, morphological, syntactic, and semantic descriptions.\n","printing last sentence\n","A classical translating machine stands with one foot on the input text and one on the output.\n","found it!\n","printing last sentence\n","Its positioning at the center of these trends arises, however, not from the admixture of many discrete techniques, but rather from the application of a single simple yet powerful concept to the encoding of linguistic information.\n","found it!\n","printing last sentence\n","This paper sketches the outline of a discourse grammar which acknowledges several different levels of structure.\n","found it!\n","printing last sentence\n","The real problem in natural language processing 
is the interpretation of discourse.\n","found it!\n","printing last sentence\n","SOME COMPUTATIONAL PROPERTISS OF TREE ADJO IN ING GRAMM.~.S* K.\n","found it!\n","printing last sentence\n","We have presented a general technique of restriction with many applications in the area of manipulating complexfeature-based grammar formalisms.\n","found it!\n","printing last sentence\n","The success of this approach is dependent on marking missing syntactic constituents as elided and missing semantic roles as ESSENTIAL so that reference resolution can know when to look for referents.\n","found it!\n","printing last sentence\n","Philadelphia, PA 19104 ABSTRACT' A constraint is proposed in the Centering approach to pronoun resolution in discourse.\n","printing last sentence\n","We have studied the structural descriptions (tree sets) that can be assigned by various grammatical systems, and classified these formalisms on the basis of two features: path complexity; and path independence.\n","found it!\n","printing last sentence\n","EXAMPLE: She often beats her.\n","found it!\n","printing last sentence\n","This method overcomes the shortcomings of previously existing methods, and has the following desirable properties: The unification method presented here represents a general solution to a seemingly intractable problem.\n","printing last sentence\n","Deduction is explosive, and since the abduction scheme augments deduction with the assumptions, it is even more explosive.\n","found it!\n","printing last sentence\n","The main result of this exploratory study is the finding that control is a useful parameter for identifying discourse structure.\n","found it!\n","printing last sentence\n","For general comments, all the above, and Cede Paris, Stuart Shapiro, and Norm Sondheimer.\n","found it!\n","printing last sentence\n","The problem of generating a well-formed naturallanguage expression from an encoding of its meaning possesses certain properties which distinguish it from the 
converse problem of recovering a meaning encoding from a given natural-language expression.\n","found it!\n","printing last sentence\n","COOKING UP REFERRING EXPRESS IONS Robert Dale Centre for Cognitive Science, University of Edinburgh 2 Buccleuch Place, Edinburgh EH8 9LW, Scotland email: rda~uk, ac.\n","found it!\n","printing last sentence\n","What is new is that facilities for the computational storage and analysis of large bodies of natural language have developed significantly in recent years, so that it is now becoming possible to test and apply informal assertions of this kind in a more rigorous way, and to see what company our words do keep.\n","found it!\n","printing last sentence\n","Obviously it would be instructive to conduct a similar analysis on other textual types.\n","found it!\n","printing last sentence\n","The parser treats this information as another set of unary constraints and applies it to the constraint network.\n","found it!\n","printing last sentence\n","Mixed Initiative in Dialogue: An Investigation into Discourse Segmentation Marilyn Walker University of Pennsylvania* Computer Science Dept.\n","found it!\n","printing last sentence\n","Our application domain is the domain of stock market reports and the corpus on which our expertise is based consists of more than 10 million words taken from the Associated Press news wire.\n","found it!\n","printing last sentence\n","Using a similarity metric derived from the distribution of subjects, verbs and objects in a corpus of English text, we have shown the plausibility of deriving semantic relatedness from the distribution of syntactic forms.\n","found it!\n","printing last sentence\n","The resolution of lexical ambiguities in non-restricted text is one of the most difficult tasks of natural language processing.\n","found it!\n","printing last sentence\n","Table 4 shows some interesting examples of this.\n","found it!\n","printing last sentence\n","We thank Susanne Wolff and and Evelyne 
Tzoulcermann for their pains in aligning sentences.\n","found it!\n","printing last sentence\n","The final two sections provide a brief comparison to related work and draw conclusions.\n","found it!\n","printing last sentence\n","Our attempt to use lexical associations derived from distribution of lexical items in text shows promising results.\n","found it!\n","printing last sentence\n","The French noun interet, for example, is translated into German as either Zins or Interesse according to its sense, but both of these senses are translated into English as interest, and so we make no attempt to distinguish them.\n","found it!\n","printing last sentence\n","Section 7 mentions some benefits of using QLF-like representations in implementing natural language systems.\n","found it!\n","printing last sentence\n","We would like to thank Patti Price for her helpful comments on earlier drafts, as well as for her participation in the development of the notational system used.\n","found it!\n","printing last sentence\n","The second author is partially supported by DARPA Grant N0014-90-31863, ARO Grant DAAL03-89-C-0031 and NSF Grant 111190-16592.\n","found it!\n","printing last sentence\n","Word-sense disambiguation is a long-standing problem in computational linguistics (e.g., Kaplan (1950), Yngve (1955), Bar-Hillel (1960), Masterson (1967)), with important implications for a number of practical applications including text-to-speech (TTS), machine translation (MT), information retrieval (IR), and many others.\n","found it!\n","printing last sentence\n","Char_align has succeeded in meeting many of these goals because it works at the character level and does not depend on finding sentence and/or paragraph boundaries which are surprisingly elusive in realistic applications.\n","found it!\n","printing last sentence\n","However, substantially greater computing power is required before these approaches can become practical, and there is not much room for further improvements in 
accuracy.\n","found it!\n","printing last sentence\n","The algorithm is robust, and extensible in several ways.\n","found it!\n","printing last sentence\n","The success of the HBG model encourages future development of general history-based grammars as a more promising approach than the usual P-CFG.\n","found it!\n","printing last sentence\n","Gemini is a natural language (NL) understanding system developed for spoken language applications.\n","found it!\n","printing last sentence\n","We have presented an efficient message passing algorithm for principle-based parsing, where ferent places so that stricter principles are applied earlier.\n","found it!\n","printing last sentence\n","Both authors' work was partially supported by DARPA and ONR under contract N00014-89-J-1782; Passonneau was also partly supported by NSF grant IRI-91-13064.\n","printing last sentence\n","Statistical data on word cooccurrence relations play a major role in many corpus based approaches for natural language processing.\n","found it!\n","printing last sentence\n","As natural language processing systems become more oriented towards solving real-world problems like machine translation or spoken language understanding in a limited domain, their need for access to vast amounts of knowledge increases.\n","found it!\n","printing last sentence\n","Methods for automatically classifying words according to their contexts of use have both scientific and practical interest.\n","found it!\n","printing last sentence\n","The desire to combine hand-coded and automatically learned knowledge suggests that we should aim for a high precision learner (even at some cost in coverage), and that is the approach adopted here.\n","found it!\n","printing last sentence\n","There has been a great deal of interest of late in the automatic induction of natural language grammar.\n","found it!\n","printing last sentence\n","Hills and valleys of LCP closely correlate with changing of segments.\n","found it!\n","printing last 
sentence\n","The structure of expository texts can be characterized as a sequence of subtopical discussions that occur in the context of a few main topic discussions.\n","found it!\n","printing last sentence\n","In the final section, we describe an improved statistical method that also permits domain-specific lexical cues to be incorporated probabilistically.\n","found it!\n","printing last sentence\n","This paper has presented a general-purpose algorithm for lexical ambiguity resolution that is perspicuous, easy to implement, flexible and applied quickly to new domains.\n","found it!\n","printing last sentence\n","The productive applications must be semantically sound, and therefore have to treated individually.\n","found it!\n","printing last sentence\n","Models were generated and tested as described in Section 2.\n","found it!\n","printing last sentence\n","The experiments above demonstrate a number of important points.\n","found it!\n","printing last sentence\n","We will also study the formal properties of DTG, and complete the design of the Earley style parser.\n","found it!\n","printing last sentence\n","In essence, our algorithm works by harnessing several powerful, empirically-observed properties of language, namely the strong tendency for words to exhibit only one sense per collocation and per discourse.\n","found it!\n","printing last sentence\n","A large-scale natural language generation (NLG) system for unrestricted text should be able to operate in an environment of 50,000 conceptual terms and 100,000 words or phrases.\n","found it!\n","printing last sentence\n","Parsing a natural language sentence can be viewed as making a sequence of disambiguation decisions: determining the part-of-speech of the words, choosing between possible constituent structures, and selecting labels for the constituents.\n","found it!\n","printing last sentence\n","As a further step, even with non parallel corpora it should be possible to locate comparable passages of 
text.\n","found it!\n","printing last sentence\n","In this paper, we have presented a new approach for WSD using an exemplar based learning algorithm.\n","found it!\n","printing last sentence\n","The remainder of the paper is divided into four sections, one describing the overall structure of our models, and one for each of the three major components of parsing, semantic interpretation and discourse.\n","found it!\n","printing last sentence\n","The main contribution of this work has been formal: to establish a normal form for parses of "pure" Cornbinatory Categorial Grammar.\n","printing last sentence\n","This greatly reduces the search space and makes possible a polynomial-time optimization algorithm.\n","found it!\n","printing last sentence\n","Matching parsing algorithms to evaluation criteria is a powerful technique that can be used to improve performance.\n","found it!\n","printing last sentence\n","We have shown that a simple statistical model based on dependencies between words can parse Wall Street Journal news text with high accuracy.\n","found it!\n","printing last sentence\n","There can now be as many edges as bit-vectors and, not surprisingly, the computational complexity of the parsing process increases accordingly.\n","found it!\n","printing last sentence\n","This paper presents empirical support for the assumption long held by computational linguists, that intonation can provide valuable cues for discourse processing.\n","found it!\n","printing last sentence\n","To our knowledge, this is the first empirical comparison of smoothing techniques in language modeling of such scope: no other study has used multiple training data sizes, corpora, or has performed parameter optimization.\n","found it!\n","printing last sentence\n","The first author gratefully acknowledges the support of the Fulbright Foundation.\n","found it!\n","printing last sentence\n","This improves parsing performance, and, more importantly, adds useful information to the parser's 
output.\n","printing last sentence\n","The experiments indicate that categorization decisions can be made with reasonable accuracy on the basis of surface cues.\n","found it!\n","printing last sentence\n","Our use of similarity measure to relax the correctness criterion provides a possible solution to this problem.\n","found it!\n","printing last sentence\n","We introduced the notion of rhetorical parsing, i.e., the process through which natural language texts are automatically mapped into discourse trees.\n","found it!\n","printing last sentence\n","At that level, human translators find the problem quite difficult as well.\n","found it!\n","printing last sentence\n","We will also extend our analyses to nouns and verbs.\n","found it!\n","printing last sentence\n","Section 2 describes PARADISE's performance model, and Section 3 discusses its generality, before concluding in Section 4.\n","printing last sentence\n","This paper presents a trainable rule-based algorithm for performing word segmentation.\n","found it!\n","printing last sentence\n","In this manner, the model can account for a wider range of translation phenomena.\n","found it!\n","printing last sentence\n","The authors wish to thank Yoram Singer for his collaboration in an earlier phase of this research project, and Giorgio Satta for helpful discussions.\n","found it!\n","printing last sentence\n","In addition, we also describe a scoring algorithm for evaluating the cross-document coreference chains produced by our system and we compare our algorithm to the scoring algorithm used in the MUC-6 (within document) coreference task.\n","found it!\n","printing last sentence\n","At the time of writing, there is something in place for each of the major software components, though in some cases these are little more than stubs or "toy" implementations.\n","printing last sentence\n","In this paper, we showed that the error distributions for three popular state of the art part of speech taggers are highly 
complementary.\n","found it!\n","printing last sentence\n","This paper presented a new method for identifying base NPs.\n","found it!\n","printing last sentence\n","The main goal of the present work is to develop a language model that uses syntactic structure to model long-distance dependencies.\n","found it!\n","printing last sentence\n","However, we have found interesting parallels in how Portuguese and English treat regular sense extensions.\n","found it!\n","printing last sentence\n","We have devised an algorithm using context seed word TF/IDF for extracting bilingual lexicon from nonparallel, comparable corpus in English-Chinese.\n","found it!\n","printing last sentence\n","But the investigation need not be limited to wordclass tagging, for we expect that there are many other NLP tasks where combination could lead to worthwhile improvements.\n","found it!\n","printing last sentence\n","We informally present a parser in Section 5.\n","found it!\n","printing last sentence\n","This paper reports results from two approaches, one using WordNet and other based on EVCA classes.\n","found it!\n","printing last sentence\n","The meaning of an unknown word can often be inferred from its context.\n","found it!\n","printing last sentence\n","We have described a robust, knowledge-poor approach to pronoun resolution which operates on texts pre-processed by a part-of-speech tagger.\n","found it!\n","printing last sentence\n","WYSIWYM editing is a new idea that requires practical testing.\n","found it!\n","printing last sentence\n","Prepositional phrase attachment is the task of deciding, for a given preposition in a sentence, the attachment site that corresponds to the interpretation of the sentence.\n","found it!\n","printing last sentence\n","As the systems for these languages mature, we will create corresponding MindNets, beginning, as we did in English, with the processing of machine-readable reference materials and then adding information gleaned from corpora.\n","found 
it!\n","printing last sentence\n","We have outlined an algorithm in this paper that, as it stands, could significantly speed up the task of building a semantic lexicon.\n","found it!\n","printing last sentence\n","In this paper, I proposed a model for determining the hearer's attentional state which is based on the distinction between hearer-old and hearer-new discourse entities.\n","printing last sentence\n","We evaluated the similarity functions introduced in the previous section on a binary decision task, using the same experimental framework as in our previous preliminary comparison (Dagan et al., 1999).\n","found it!\n","printing last sentence\n","Thanks And although WordNet is hand-built, there is general agreement that corpus-based methods have an advantage in the relative completeness of their coverage, particularly when used as supplements to the more laborintensive methods.\n","printing last sentence\n","An important challenge in computational linguistics concerns the construction of large-scale computational lexicons for the numerous natural languages where very large samples of language use are now available.\n","found it!\n","printing last sentence\n","The resulting hierarchy is evaluated by human judges, and future research directions are discussed.\n","found it!\n","printing last sentence\n","This paper demonstrates a procedure for automatically formulating a single best tag when there are multiple judges who disagree.\n","found it!\n","printing last sentence\n","The author wishes to thank ACL reviewers for their helpful comments and suggestions.\n","found it!\n","printing last sentence\n","We used our development corpus to explore several alternative evaluation techniques, and then evaluated on the test set, which was kept blind.\n","found it!\n","printing last sentence\n","These lists are then used to recognize existential NPs in new texts.\n","found it!\n","printing last sentence\n","Lexicalized grammar formalisms are of both theoretical and 
practical interest to the computational linguistics community.\n","found it!\n","printing last sentence\n","(As a point of comparison, the parser achieves 91% dependency accuracy on English (Wall Street Journal) text.) Much of the recent research on statistical parsing has focused on English; languages other than English are likely to pose new problems for statistical methods.\n","printing last sentence\n","The method described can be seen as a simple case of the gradient descent method proposed by Rapp (1995), which does not need an initial lexicon but is computationally prohibitively expensive.\n","found it!\n","printing last sentence\n","Text in parallel translation is a valuable resource in natural language processing.\n","found it!\n","printing last sentence\n","In the ambiguity-preserving translation framework, a model like this one could be used to choose between sets of analyses whose ambiguities cannot be preserved in translation.\n","found it!\n","printing last sentence\n","Information overload has created an acute need for summarization.\n","found it!\n","printing last sentence\n","In this paper, we described the TempEval-2 task within the SemEval 2010 competition.\n","found it!\n","printing last sentence\n","We presented the description, evaluation framework and assessment of systems participating in the SemEval-2010 sense induction task.\n","found it!\n","printing last sentence\n","The authors would also like to acknowledge Giovanni Moretti from CELCT for evaluation scripts and technical assistance, and the volunteer translators that contributed to the creation of the dataset: This work has been partially supported by the ECfunded project CoSyne (FP7-ICT-4-24853).\n","printing last sentence\n","We would also like to thank Carl Sable, Min-Yen Kan, Dave Evans, Adam Budzikowski, and Veronika Horvath for their help with the evaluation.\n","found it!\n","printing last sentence\n","In current work, we are examining how to combine these two 
approaches.\n","found it!\n","printing last sentence\n","The work of Chater and Finch can be seen as similar to the work presented here given an independence assumption.\n","found it!\n","printing last sentence\n","It obtained an Fo=1 score of 93.48 on this task.\n","found it!\n","printing last sentence\n","In this paper, we explore the use of Support Vector Machines (SVMs) for CoNLL-2000 shared task, chunk identification.\n","found it!\n","printing last sentence\n","There is no question that a great deal of care and expertise went into creating the Chinese Treebank, and that it is a source of important grammatical information that is unique to the Chinese language.\n","found it!\n","printing last sentence\n","This paper proposes Japanese dependency analysis based on Support Vector Machines.\n","found it!\n","printing last sentence\n","Even when the accuracy figures for corpus-based part-of-speech taggers start to look extremely similar, it is still possible to move performance levels up.\n","found it!\n","printing last sentence\n","This is indeed the case: the -results are summarized in Table 4.\n","found it!\n","printing last sentence\n","Most approaches to natural language generation (NLG) ignore morphological variation during word choice, postponing the computation of the actual word forms to be output to a final stage, sometimes termed clinearisation'.\n","printing last sentence\n","Section 5 contains an evaluation of co-training for base noun identification.\n","found it!\n","printing last sentence\n","We would like to thank Nu Lai for help with the classification of the noun compound relations.\n","found it!\n","printing last sentence\n","This suggests that in a language with MWUs, we do show modest performance gains.\n","found it!\n","printing last sentence\n","Finally, a LSA procedure for computing document specific similarity values will be evaluated.\n","found it!\n","printing last sentence\n","Our results show strong corpus effects for statistical 
parsing models: a small amount of matched training data appears to be more useful than a large amount of unmatched data.\n","found it!\n","printing last sentence\n","The result of the work is a prototype program which takes as input set of news stories broken into separate sentences and produces as output a text that combines all the events from all the articles, organized in chronological order.\n","found it!\n","printing last sentence\n","The advent of large-scale collections of annotated data has marked a paradigm shift in the research community for natural language processing.\n","found it!\n","printing last sentence\n","We are grateful to Mitch Marcus and the Department of Computer and Information Science at the University of Pennsylvania for sponsoring the work reported here.\n","found it!\n","printing last sentence\n","We have described the use of Support Vector Machines for the biomedical named entity recognition task.\n","found it!\n","printing last sentence\n","In this paper, we describe Arabic-to-English name transliteration system using probabilistic finite state machines2 that address both the transliteration of Arab and foreign names into English.\n","found it!\n","printing last sentence\n","This is why two one letter morphs appear in a sequence in the segmentation el¨ain + tarh + a + n.) 
In Section 5, we compare the results obtained from our methods to results produced by Goldsmith’s Linguistica on the same data.\n","printing last sentence\n","We expect this rate to gradually increase as the site becomes more widely known and receives more traffic.\n","found it!\n","printing last sentence\n","These may be learned using the described methods.\n","found it!\n","printing last sentence\n","In these experiments we have proposed new measure and weight functions that, as our evaluation has shown, significantly outperform existing similarity functions.\n","found it!\n","printing last sentence\n","In a trigram tagger the score for a tagged sequence t [1:n]paired with a word se quence w [1:n] is 2 P n i=1 t i See (Collins and Duy 2001; Collinsand Duy 2002; Collins 2002) for other applica tions of the voted perceptron to NLP problems.\n","printing last sentence\n","Based on our experimental results, there appears to be no single, universally best knowledge source.\n","found it!\n","printing last sentence\n","Today, very large amounts of information are available in on-line documents.\n","found it!\n","printing last sentence\n","The main shortcoming of the phrase-based model in this paper concerns the size of the t-table and the cost of the training procedure we currently apply.\n","found it!\n","printing last sentence\n","For the future, we plan the application of refined translation and language models for rescoring on word graphs.\n","found it!\n","printing last sentence\n","Finally, we present results showing that learning multiple semantic categories simultaneously improves performance.\n","found it!\n","printing last sentence\n","The work reported here was supported in part by the Defense Advanced Research Projects Agency under contract number N66001-00-C-8008.\n","found it!\n","printing last sentence\n","Initial evaluation of the grammar on new domains and the growth curve of grammar coverage should bear this out.\n","found it!\n","printing last 
sentence\n","More importantly, the grammar matrix will help to remove one of the primary remaining obstacles to commercial deployment of grammars of this type and indeed of the commercial use of deep linguistic analysis: the immense cost of developing the resource.\n","found it!\n","printing last sentence\n","The experiences of the ParGram grammar writers has shown that the parallelism of analysis and implementation in the ParGram project aids further grammar development efforts.\n","found it!\n","printing last sentence\n","In addition, we showed that dynamic features significantly contribute to improve the performance.\n","found it!\n","printing last sentence\n","In this paper, we have described experiments comparing the performance of a number of different algorithms for estimating the parameters of a conditional ME model.\n","found it!\n","printing last sentence\n","Named entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n","found it!\n","printing last sentence\n","process of intra-family translation was handled by weighted string distance models of cognate similarity with a probabilistic representation of common intrafamily orthographic transformations.\n","found it!\n","printing last sentence\n","There are many people who contributed greatly to making this word alignment evaluation task possible.\n","found it!\n","printing last sentence\n","Subjectivity is a complex linguistic phenomenon and our evidence suggests that reliable subjectivity classification requires a broad array of features.\n","found it!\n","printing last sentence\n","First as a useful visualization tool themselves, and second as seeds for disambiguating further entities.\n","found it!\n","printing last sentence\n","We have shown that co-training is an effective technique for bootstrapping POS taggers trained on small amounts of labelled data.\n","found it!\n","printing last sentence\n","Named entities are phrases that contain the names of 
persons, organizations and locations.\n","found it!\n","printing last sentence\n","Our NER system demonstrates that using a large variety of features produces good performance.\n","found it!\n","printing last sentence\n","As a machine learning method, the RRM algorithm seems especially suited to handle additional feature streams, and therefore is a good candidate for classifier combination.\n","found it!\n","printing last sentence\n","This paper also again demonstrates how the ease of incorporating features into a discriminative maxent model allows for productive feature engineering.\n","found it!\n","printing last sentence\n","German F1 using very limited lexicons is 68.11%.\n","found it!\n","printing last sentence\n","This system will be evaluated in upcoming iCLEF conferences.\n","found it!\n","printing last sentence\n","Syntax mediates between surface word order and meaning.\n","found it!\n","printing last sentence\n","Correctly identifying the semantic roles of sentence constituents is a crucial part of interpreting text, and in addition to forming an important part of the information extraction problem, can serve as an intermediate step in machine translation or automatic summarization.\n","found it!\n","printing last sentence\n","We propose to investigate other models such as the probabilistic one given in Section 2.3.\n","found it!\n","printing last sentence\n","Finally, Section 5 summarizes our findings and conclusions.\n","found it!\n","printing last sentence\n","Any opinions, findings, or recommendations are those of the authors and do not necessarily reflect ARDA’s views.\n","printing last sentence\n","In this paper I have shown how keyword extraction from abstracts can be achieved by using simple statistical measures as well as syntactic information from the documents, as input to a machine learning algorithm.\n","found it!\n","printing last sentence\n","We also investigate incorporation of this transliteration system in a cross-lingual spoken document 
retrieval application, in which English text queries are used to index and retrieve Mandarin audio from the TDT corpus.\n","found it!\n","printing last sentence\n","We feel that this First International Chinese Word Segmentation Bakeoff has been useful in that it has provided us with a good sense of the range of performance of various systems, both from academic and industrial institutions.\n","found it!\n","printing last sentence\n","Its high accuracy on makes it a good candidate as a general purpose segmenter.\n","found it!\n","printing last sentence\n","We look forward to participate forthcoming bakeoff.\n","found it!\n","printing last sentence\n","This paper has described the implementation and evaluation of four corpus-based approaches to the semantics of verb-particle constructions.\n","found it!\n","printing last sentence\n","Many people are working on acquisition of multiword expressions, although terminology varies.\n","found it!\n","printing last sentence\n","We evaluated the method over English NN compounds and verbparticles, and showed it to correlate moderately with WordNet-based hyponymy values.\n","found it!\n","printing last sentence\n","In this paper, we have analyzed the potential for incremental processing in deterministic dependency parsing.\n","found it!\n","printing last sentence\n","Overall, the results achieved in this SENSEVAL-3 task were quite high.\n","found it!\n","printing last sentence\n","Many thanks to all those who contributed to the Open Mind Word Expert project, making this task possible.\n","found it!\n","printing last sentence\n","the glosses.\n","found it!\n","printing last sentence\n","In this paper, we introduced ROUGE, an automatic evaluation package for summarization, and conducted comprehensive evaluations of the automatic measures included in the ROUGE package using three years of DUC data.\n","found it!\n","printing last sentence\n","Experiments were conducted on a training and evaluation set provided by the task 
organizers.\n","found it!\n","printing last sentence\n","We describe a new corpus of over 180,000 handannotated dialog act tags and accompanying adjacency pair annotations for roughly 72 hours of speech from 75 naturally-occurring meetings.\n","found it!\n","printing last sentence\n","Natural language decisions often depend on the outcomes of several different but mutually dependent predictions.\n","found it!\n","printing last sentence\n","This research is supported by a National Science Foundation Faculty Early CAREER Development Award (#0092784).\n","found it!\n","printing last sentence\n","We are grateful to three anonymous reviewers for constructive com ments on the preliminary version of the paper.\n","found it!\n","printing last sentence\n","(Rosario and Hearst 2001) focused on the medical domain making use of a lexical ontology and standard machine learning techniques.\n","found it!\n","printing last sentence\n","In particular, the use of PropBank’s annotation tool and frame files proved invaluable to our effort.\n","printing last sentence\n","Finally, we would like to thank the anonymous workshop reviewers for their comments.\n","found it!\n","printing last sentence\n","We have described here an integrated annotation approach for two areas of biomedical information extraction.\n","found it!\n","printing last sentence\n","This trade-off between the complexity, accuracy and efficiency of a parsing model is an important area of future research.\n","found it!\n","printing last sentence\n","The authors wish to thank the reviewers for their helpful comments and Google Inc.\n","found it!\n","printing last sentence\n","In future work we also plan to find the valid contexts for entailment relations.\n","found it!\n","printing last sentence\n","We close by reviewing prior work in areas related to this paper (§5).\n","printing last sentence\n","Previous work on extracting bilingual or monolingual sentence pairs from comparable corpora has only been applied to 
documents that are within the same topic, or have very similar publication dates.\n","found it!\n","printing last sentence\n","Semantic role tagging is thus an one of N classification task.\n","found it!\n","printing last sentence\n","We gratefully acknowledge the support of NSERC of Canada.\n","found it!\n","printing last sentence\n","We presented a novel approach to the problem of generating sentence-level paraphrases in a broad semantic domain.\n","found it!\n","printing last sentence\n","Finally, we discuss experimental results (Section 4) and give conclusions with possible future directions (Section 5).\n","found it!\n","printing last sentence\n","Language differences between English and Chinese have made direct porting of an English POS tagging method to Chinese ineffective.\n","found it!\n","printing last sentence\n","Automatic capitalization is a practically relevant problem: speech recognition output needs to be capitalized; also, modern word processors perform capitalization among other text proofing algorithms such as spelling correction and grammar checking.\n","found it!\n","printing last sentence\n","Two experiments on the opinion and modality classification tasks are employed to confirm that subtree features are important.\n","found it!\n","printing last sentence\n","Text summarization is the process of automatically creating a compressed version of a given text that provides useful information for the user.\n","found it!\n","printing last sentence\n","Having a trusted experimental framework is essential for drawing conclusions on the effects of system changes.\n","found it!\n","printing last sentence\n","An important aspect of TextRank is that it does not require deep linguistic knowledge, nor domain or language specific annotated corpora, which makes it highly portable to other domains, genres, or languages.\n","found it!\n","printing last sentence\n","Recently an increasing amount of research has been devoted to investigating methods of 
recognizing favorable and unfavorable sentiments towards specific subjects within natural language texts.\n","found it!\n","printing last sentence\n","5.1 Collins Head-Driven Model 2.\n","found it!\n","printing last sentence\n","The constraints are encoded as the followings.\n","found it!\n","printing last sentence\n","This paper introduces several syntax-based metrics for the evaluation of MT, which we find to be particularly useful for predicting a hypothesis’s fluency.\n","printing last sentence\n","High-levels of correlation at the segment level are important because they are likely to yield a metric that is sensitive to minor differences between systems and to minor differences between different versions of the same system* Furthermore, current levels of correlation at the sentence level are still rather low, offering a very significant space for improvement* The results reported in this paper demonstrate that all of the individual components included within METEOR contribute to improved correlation with human judgments* In particular, METEOR is shown to have statistically significant better correlation compared to unigram-precision, unigramrecall and the harmonic FI combination of the two* We are currently in the process of exploring several further enhancements to the current METEOR metric, which we believe have the potential to significantly further improve the sensitivity of the metric and its level of correlation with human judgments* Our work on these directions is described in further detail in Section 4* ing is then also used in order to calculate an aggregate score for the MT system over the entire test set* Section 2 describes the metric in detail, and provides a full example of the matching and scoring* In previous work (Lavie et al*, 2004), we compared METEOR with IBM's BLEU metric and it's derived NIST metric, using several empirical evaluation methods that have been proposed in the recent literature as concrete means to assess the level of 
correlation of automatic metrics and human judgments* We demonstrated that METEOR has significantly improved correlation with human judgments* Furthermore, our results demonstrated that recall plays a more important role than precision in obtaining high-levels of correlation with human judgments* The previous analysis focused on correlation with human judgments at the system level* In this paper, we focus our attention on improving correlation between METEOR score and human judgments at the segment level.\n","printing last sentence\n","Future work will consider the investigation of more sophisticated representations of sentence structure, such as first order predicate logic or semantic parse trees, which should allow for the implementation of more effective measures of text semantic similarity.\n","found it!\n","printing last sentence\n","Model 2 (Collins, 2003), and to a synchronous CFG based machine translation system (Chiang, 2005).\n","found it!\n","printing last sentence\n","Additionally, we plan to investigate the use of the beam strategy of Ratnaparkhi (1997) to pursue multiple parses while keeping the run-time linear.\n","found it!\n","printing last sentence\n","This paper presented a methodology to identify an opinion with its holder and topic given a sentence in online news media texts.\n","found it!\n","printing last sentence\n","To summarize, in order to classify an MWE as non-compositional, we compute an approximation of its compositional meaning and compare this with the meaning of the expression as it is used on the whole.\n","found it!\n","printing last sentence\n","The SPMT models are similar to the models proposed by Chiang (2005) and Galley et al.\n","found it!\n","printing last sentence\n","Smoothing is an important technique in statistical NLP, used to deal with perennial data sparseness and empirical distributions that overfit the training corpus.\n","found it!\n","printing last sentence\n","Discriminative learning methods are ubiquitous in 
natural language processing.\n","found it!\n","printing last sentence\n","Many inference algorithms require models to make strong assumptions of conditional independence between variables.\n","found it!\n","printing last sentence\n","Any opinions, findings, and conclusions or recommendations expressed are those of the authors and do not necessarily reflect the views or official policies, either expressed or implied, of any sponsoring institutions, the U.S.\n","found it!\n","printing last sentence\n","Experimental results are shown in Section 6, and we conclude in Section 7.\n","found it!\n","printing last sentence\n","In addition, we achieve an F-measure of 68.9 for link relationidentification and 82.0 for opinion expression ex traction; for the latter task, our system achieves human-level performance.2 This paper presented a global inference approachto jointly extract entities and relations in the con text of opinion oriented information extraction.\n","printing last sentence\n","(Ciaramita et al., 2005)).\n","found it!\n","printing last sentence\n","This research was partially supported by a National Science Foundation Faculty Early CAREER Development Award (#0092784).\n","found it!\n","printing last sentence\n","In this paper we investigate a new problem of automatically identifying the perspective from which a document is written.\n","found it!\n","printing last sentence\n","There are many directions for interesting research building on the work done in this shared task.\n","found it!\n","printing last sentence\n","The parser does not attempt to assign a dependency relation to the root.\n","found it!\n","printing last sentence\n","It is our hope that a better morphological feature set will help with both unlabeled parsing and labeling for highly inflected languages.\n","found it!\n","printing last sentence\n","We are grateful for the support from T ¨UBË™ITAK (The Scientific and Technical Research Council of Turkey) and the Swedish Research Council.\n","printing 
last sentence\n","Finally, we demonstrate that interpolation of the two estimates can provide a modest increase in BLEU score over the heuristic baseline.\n","found it!\n","printing last sentence\n","In recent evaluations, phrase-based statistical machine translation systems have achieved good performance.\n","found it!\n","printing last sentence\n","was done by the participants.\n","found it!\n","printing last sentence\n","In this work we applied syntax based resources (the target language parser) to annotate and generalize phrase translation tables extracted via existing phrase extraction techniques.\n","found it!\n","printing last sentence\n","This paper presents an adaptation of the classic syntax-directed translation with linguisticallymotivated formalisms for statistical MT.\n","found it!\n","printing last sentence\n","This paper contains three contributions: Workshop on TextGraphs, at HLT-NAACL 2006, pages 45–52, New York City, June 2006.\n","printing last sentence\n","Clustering is the process of grouping together objects based on their similarity to each other.\n","found it!\n","printing last sentence\n","In this section, we first verify the effectiveness of fixed-link pruning, and then test our phrasal ITG, both as an aligner and as a translation model.\n","found it!\n","printing last sentence\n","ability to guide translation would be enhanced if the constraints encoded in the tags were to be enforced using combinatory operators.\n","found it!\n","printing last sentence\n","We have investigated a number of approaches to mixture-based adaptation using genres for Chinese to English translation.\n","found it!\n","printing last sentence\n","For more on the participating systems, please refer to the respective system description in the proceedings of the workshop.\n","found it!\n","printing last sentence\n","HR0011-06-C-0022 and in part under the EuroMatrix project funded by the European Commission (6th Framework Programme).\n","found it!\n","printing last 
sentence\n","In this paper we described newly developed language-specific instances of the METEOR metric and the process of optimizing metric parameters for different human measures of translation quality and for different languages.\n","found it!\n","printing last sentence\n","The following sources were used in the preparation of the data: http://www1.cs.columbia.edu/~ani/DUC2005/ We would like to thank the people and organizations that made these sources available for the challenge.\n","found it!\n","printing last sentence\n","The paper is structured as follows: in the next section, we describe the difficulty in learning English preposition usage; in Section 3, we discuss related work; in Sections 4-7 we discuss our methodology and evaluation.\n","found it!\n","printing last sentence\n","Finally, Section 5 draws the conclusions.\n","found it!\n","printing last sentence\n","It is commonly thought that one of the major obstacles to high-performance Word Sense Disambiguation (WSD) is the fine granularity of sense inventories.\n","found it!\n","printing last sentence\n","Indeed, since the systems in SemEval did not know the candidate substitutes for a word before hand, the lexical resource is evaluatedas much as the context based disambiguation com ponent.\n","found it!\n","printing last sentence\n","Finally, Section 4 presents some con clusions.\n","found it!\n","printing last sentence\n","EvaluationThe evaluation approach of TempEval avoids the in terdependencies that are inherent to a network of temporal relations, where relations in one part of the network may constrain relations in any other part ofthe network.\n","found it!\n","printing last sentence\n","Correctly disambiguating words (WSD), and correctly identifying the semantic relationships be tween those words (SRL), is an important step forbuilding successful natural language processing applications, such as text summarization, question an swering, and machine translation.\n","found it!\n","printing last 
sentence\n","The testing data for this task turned out to be espe cially challenging with regard to new frames, since, in an effort to annotate especially thoroughly, almost 10340 new frames were created in the process of an notating these three specific passages.\n","found it!\n","printing last sentence\n","Thanks to Ben Taskar for pointing out the work of Meil˘a and Jaakkola (2000).\n","printing last sentence\n","For more on the participating systems, please refer to the respective system descriptions in the proceedings of the workshop.\n","found it!\n","printing last sentence\n","The authors would like to thank Menqgiu Wang and Huihsin Tseng for useful discussions.\n","found it!\n","printing last sentence\n","Finally, conclusion and future work are presented in section 6.\n","found it!\n","printing last sentence\n","We also thank the workshop reviewers for their helpful comments.\n","found it!\n","printing last sentence\n","The following ideas are central to our approach: Thanks to Jenny Rose Finkel for suggesting that we evaluate dependency parsing accuracies.\n","printing last sentence\n","Section 7 concludes the paper.\n","found it!\n","printing last sentence\n","CoNLL 2008: Proceedings of the 12th Conference on Computational Natural Language Learning, pages 183?187 Manchester, August 2008 Dependency-based Syntactic?Semantic Analysis with PropBank and NomBank Richard Johansson and Pierre Nugues Lund University, Sweden {richard, pierre}@cs.lth.se Abstract This paper presents our contribution in the closed track of the 2008 CoNLL Shared Task (Surdeanu et al., 2008).\n","found it!\n","printing last sentence\n","All of the data, translations, and human judgments produced for our workshop are publicly available.1 We hope they form a valuable resource for research into statistical machine translation, system combination, and automatic evaluation of translation quality.\n","found it!\n","printing last sentence\n","We hope the release of the toolkit will greatly 
contribute the progress of the syntax-based machine translation research.' Large scale parsing-based statistical machine translation (e.g., Chiang (2007), Quirk et al.\n","printing last sentence\n","A well-known problem of Statistical Machine Translation (SMT) is that performance quickly degrades as soon as testing conditions deviate from training conditions.\n","found it!\n","printing last sentence\n","This work was supported, in part, by BBN Technologies under the GALE Program, DARPA/IPTO Contract No.\n","found it!\n","printing last sentence\n","We are grateful to four anonymous reviewers for their valuable comments and suggestions.\n","found it!\n","printing last sentence\n","Extracts 20,176 titles and 15,182 redirects.\n","found it!\n","printing last sentence\n","We are thankful to three anonymous reviewers for their valuable comments.\n","found it!\n","printing last sentence\n","The history of text mining (TM) shows that shared tasks based on carefully curated resources, such as those organized in the MUC (Chinchor, 1998), TREC (Voorhees, 2007) and ACE (Strassel et al., 2008) events, have significantly contributed to the progress of their respective fields.\n","found it!\n","printing last sentence\n","Complex emotions can be viewed as combinations of these basic emotions.\n","found it!\n","printing last sentence\n","All of the shared task data is available on the workshop website.\n","found it!\n","printing last sentence\n","As with past years, all of the data, translations, and human judgments produced for our workshop are publicly available.2 We hope they form a valuable resource for research into statistical machine translation, system combination, and automatic evaluation of translation quality.\n","found it!\n","printing last sentence\n","This paper proposed a novel method to model the compositionality of meaning in distributional models of semantics.\n","found it!\n","printing last sentence\n","Semantic Parsing, the process of converting text into a 
formal meaning representation (MR), is one of the key challenges in natural language processing.\n","found it!\n","printing last sentence\n","The latter task falls within the scope of semantic analysis of sentences exploiting syntactic patterns, as hedge spans can usually be determined on the basis of syntactic patterns dependent on the keyword.\n","found it!\n","printing last sentence\n","Microblogging websites have evolved to become a source of varied kind of information.\n","found it!\n","printing last sentence\n","There have been ongoing efforts since BioNLP-ST 2009 to develop IE systems based on the task resources, and we hope to see continued efforts also following BioNLP-ST 2011, especially exploring the use of supporting task resources for main tasks.\n","found it!\n","printing last sentence\n","This paper presents the task setup, preparation, and discusses the results.\n","found it!\n","printing last sentence\n","Finally, we offer our special thanks to Llufs M`arquez and Joakim Nivre for their wonderful support and guidance without which this task would not have been successful.\n","found it!\n","printing last sentence\n","We also report in this section our official results in the testing partition.\n","found it!\n","printing last sentence\n","Tables 39–48 give the automatic scores for each of the systems.\n","printing last sentence\n","The source code and all resources for Meteor 1.3 and the version of Z-MERT with Meteor integration will be available for download from the Meteor website.\n","found it!\n","printing last sentence\n","Language models are widely applied in natural language processing, and applications such as machine translation make very frequent queries.\n","found it!\n","printing last sentence\n","We therefore categorize all commercial systems as unconstrained when evaluating the results.\n","found it!\n","printing last sentence\n","The two steps are described in the following section.\n","found it!\n","printing last sentence\n","After 
all, as Woods [1975] has pointed out, while descriptive analyses of language can at best tell us what the brain does, engineering analyses can potentially offer insights on why the brain functions as it does.\n","found it!\n","printing last sentence\n","The rule-based tagger is based on a learning algorithm called transformation-based errordriven learning.\n","found it!\n","printing last sentence\n","The backed-off estimate scores appreciably better than other methods which have been tested on the Wall Street Journal corpus.\n","found it!\n","printing last sentence\n","The final section draws some conclusions.\n","found it!\n","printing last sentence\n","This paper represents a step toward getting as much leverage as possible out of work within that paradigm, and then using it to help determine relationships among word senses, which is really where the action is.\n","found it!\n","printing last sentence\n","'Note that this is one of the cases where Church's chunker allows separate NP fragments to count as chunks.\n","printing last sentence\n","A machine translation system must be able to choose among possible translations based on context.\n","found it!\n","printing last sentence\n","In this paper we show that a heuristic case base compression formalism (Daelemans et al., 1996), makes the memory-based approach computationally attractive.\n","found it!\n","printing last sentence\n","Nearest neighbor grows linearly with the number of training instances as expected; more sophisticated indexing methods can reduce this to logarithmic expected time (Friedman, Bentley, & Finkel, 1977).5 Recent research in empirical (corpus-based) natural language processing has explored a number of different methods for learning from data.\n","printing last sentence\n","The Maximum Entropy model is an extremely flexible technique for linguistic modelling, since it can use a virtually unrestricted and rich feature set in the framework of a probability model.\n","found it!\n","printing last 
sentence\n","The Data-Oriented Parsing (DOP) model has a short, interesting, and controversial history.\n","found it!\n","printing last sentence\n","The most computationally expensive part of the system is the word sense disambiguation of the training corpus.\n","found it!\n","printing last sentence\n","The information can be used in language modeling in addition to the currently popular N-gram models and word trigger pairs.\n","found it!\n","printing last sentence\n","It has long been observed that selectional constraints and word sense disambiguation are closely linked.\n","found it!\n","printing last sentence\n","Lastly, this paper clearly demonstrates that schemes for reranking the top 20 parses deserve research effort since they could yield vastly better accuracy results.\n","found it!\n","printing last sentence\n","In this paper, we examine thresholding techniques for statistical parsers.\n","found it!\n","printing last sentence\n","The optimal way to analyze linguistic data into its primitive elements is rarely obvious but often crucial.\n","found it!\n","printing last sentence\n","Semantic information can be helpful in almost all aspects of natural language understanding, including word sense disambiguation, selectional restrictions, attachment decisions, and discourse processing.\n","found it!\n","printing last sentence\n","In future work, we will investigate modifications of these algorithms and feature set selection that are more effective on highly skewed sense distributions.\n","found it!\n","printing last sentence\n","Using Lexical Chains for Text Summarization Reg ina Barz i lay Mathematics and Computer S~nence Dept Ben Gunon University m the Negev Beer-Sheva, 84105 Israel regana@cs.bEu ac.\n","found it!\n","printing last sentence\n","Researchers in computational linguistics (Mann and Thompson, 1988, Matthiessen and Thompson, 1988, Sparck Jones, 1993) have long speculated that the nuclei that pertain to a rhetorical structure tree (RS-tree) (Mann and 
Thompson, 1988) constitute an adequate summanzation of the text for which that RS-tree was built However, to our knowledge, there was no experiment to confirm how valid this speculation really is In what follows, we describe an experiment that shows that there exists a strong correlation between the nuclei of the RS-tree of a text and what readers perceive to be the most important units in a text We know from the results reported in the psychological literature on summarization (Johnson, 1970, Chou Hare and Borchardt, 1984, Sherrard, 1989) that there exists a certain degree of disagreement between readers with respect to the importance that they assign to various textual units and that the disagreement is dependent on the quality of the text and the comprehension and summarization skills of the readers (Winograd.\n","found it!\n","printing last sentence\n","GermaNet is a broad-coverage lexical-semantic net for German which currently contains some 16.000 words and aims at modeling at least the base vocabulary of German.\n","found it!\n","printing last sentence\n","Also the comments of two anonymous reviewers proved quite helpful.\n","found it!\n","printing last sentence\n","We are indebted to Renee Pohlmann for giving us good pointers at an early stage of this work, and to AnseImo Peilas and David Fernandez for their help finishing up the test collection.\n","found it!\n","printing last sentence\n","Conceptual natural language processing typically involves case frame instantiation to recognize events and role objects in text.\n","found it!\n","printing last sentence\n","Rather than the thousands of edges required by C&C, the parser presented here requires hundreds, or even, if one is willing to pay a small price in accuracy, tens.\n","printing last sentence\n","Named entity recognition is one of the simplest of the common message understanding tasks.\n","found it!\n","printing last sentence\n","This too is a topic for future research.\n","found it!\n","printing last 
sentence\n","It is necessary to be careful in evaluating these results, which are only as good as the evaluation function.\n","found it!\n","printing last sentence\n","Our extension of WordNet intends to serve as a lexico-semantic resource for a variety of NLP applications, many of them requiring pragmatic and common-sense knowledge (Harabagm and Moldovan 1998) It is beneficial to transform the conceptual glosses in logical formulae Approach to implement Logical Form Transformations (LFTs) (1) Traditional lexicographic principles determine the discrimination of any conceptual definitions into a genus and the differentia Our LFTs implement the same distinction by always placing the genus predicate on the first position of the LFT, and the rest of the LFT viewed as the definition differentia In the case when the subject or the object are present in the gloss, they share the corresponding arguments with the action/state/event predicate For example, the LFT of (a person who backs a politician) the gloss of {supporter, protagonist, champion, admirer, booster, friend} is LFT = [person n#1(2,1) Sz back v#1(e1,114)) politician n#2(x2) (4) The role of complements within a phrase is replicated in the LFTs Predicates geneiated from modifiers share the same arguments with the predicates corresponding to the phrase heads Adjective piedicates share the same argument as the predicate corresponding to the noun they modify An exemplification is the LFT of the gloss of {art if act , artefact}, which maps (a man-made object) into [ object n#1(xi) Sc man-made a#1(x1)] Similarly, the argument of adverbial predicate is the argument marking the eventuality of the event/state/action they modify For example, the gloss of the verb synset {hare} is (run quickly), producing the LFT = [run(ei,a,i,x2) & quickly(e")] under the same syntactic role (e g subject, object or prepositional object) By convention, conjunctionpredicates have a variable number of arguments, since they cover a variable 
number of predicates The first argument represents the "result" of the logical operation induced by the conjunction (e g a logical and in the case of the and conjunction, or a logical or in the case of the or conjunction) The rest of the aiguments indicate the predicates covered by the conjunction, as they are aiguments of those predicates as well (6) We also geneiate 'medicates for every preposition encountered in the gloss The preposition predicates always have two arguments the first argument corresponding to the predicate of the head of the phi ase to which prepositional phi ase is attached, whereas the second argument corresponds to the prepositional object Sources of information.\n","printing last sentence\n","The goal of machine translation is the translation of a text given in some source language into a target language.\n","found it!\n","printing last sentence\n","We would like to thank David Pierce for his formatting and technical advice.\n","found it!\n","printing last sentence\n","The ability to determine the named entities in a text has been established as an important task for several natural language processing areas, including information retrieval, machine translation, information extraction and language understanding.\n","found it!\n","printing last sentence\n","Unlabeled examples in the named-entity classification problem can reduce the need for supervision to a handful of seed rules.\n","found it!\n","printing last sentence\n","We have presented two general approaches to studying parser combination: parser switching and parse hybridization.\n","found it!\n","printing last sentence\n","Our new features, and especially the composite ones, are shown to outperform traditional techniques such as TF*IDF [Buckley 1985; Salton 1989] for determining similarity over small text units.\n","found it!\n","printing last sentence\n","In this paper we studied cascaded grammatical relations assignment.\n","found it!\n","\n"]}],"source":["# Function to extract the 
last sentence from a text\n","\n","# NOTE(review): this cell relies on `pd`, `re` and the `files` dict\n","# (xml filename -> file content) defined by earlier cells; it is not self-contained.\n","file_path = r'/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/summaries_pacsum_Scisummnet_dataset_final.xlsx'\n","df = pd.read_excel(file_path)\n","\n","def get_last_sentence(text):\n","    \"\"\"Return the last sentence of `text` (split after '.', '!' or '?').\"\"\"\n","    sentences = re.split(r'(?<=[.!?]) +', text)\n","    # BUGFIX: this previously returned sentences[0] (the *first* sentence),\n","    # contradicting the function name and the 'printing last sentence' log below.\n","    return sentences[-1].strip() if sentences else ''\n","\n","# Process each row in the DataFrame\n","for index, row in df.iterrows():\n","    summary_text = row['summary_text']\n","    last_sentence = get_last_sentence(summary_text)\n","    print(\"printing last sentence\")\n","    print(last_sentence)\n","\n","    # Find the paper_index where the last sentence is a substring of the XML content\n","    found_index = None\n","    for paper_index, xml_content in files.items():\n","        if last_sentence in xml_content:\n","            found_index = paper_index\n","            print(\"found it!\")\n","            break\n","\n","    # Update the DataFrame (left as None/NaN when no paper matched)\n","    df.at[index, 'paper_index'] = found_index\n","\n","# BUGFIX: df.head is a bound method; it must be called to display the first rows.\n","print(df.head())\n","\n","# Save the updated DataFrame to an Excel file\n","df.to_excel('updated_file_pacsum.xlsx', index=False)\n","\n","# print('CORRESPONDING SUMMARY')\n","# print(df['summary_text'][index])\n"]},{"cell_type":"code","execution_count":17,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"AoV7IQyrvkCS","outputId":"259f7506-0d9c-4834-9f64-f164ba075da3","executionInfo":{"status":"ok","timestamp":1719059027146,"user_tz":-240,"elapsed":1869,"user":{"displayName":"Aditi Paretkar","userId":"17466297872366651006"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["Excel file '/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/updated_file_pacsum.xlsx' has been updated.\n"]}],"source":["import pandas as pd\n","import re\n","\n","# Define the file path\n","file_path = r'/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/updated_file_pacsum.xlsx'\n","\n","# Read the Excel file\n","df = pd.read_excel(file_path)\n","\n","\n","# Function to extract the first sentence from a text\n","def 
get_first_sentence(text):\n","    \"\"\"Return the first sentence of `text` (split after '.', '!' or '?').\"\"\"\n","    parts = re.split(r'(?<=[.!?]) +', text)\n","    return parts[0].strip() if parts else ''\n","\n","# Second pass: fill in paper_index only for rows the last-sentence pass missed.\n","for index, row in df.iterrows():\n","    # Only rows whose paper_index is still empty\n","    if pd.isna(row['paper_index']):\n","        first_sentence = get_first_sentence(row['summary_text'])\n","\n","        # First paper whose XML content contains the first sentence, else None.\n","        # NOTE(review): `files` (xml filename -> content) comes from an earlier cell.\n","        found_index = next(\n","            (paper_index for paper_index, xml_content in files.items()\n","             if first_sentence in xml_content),\n","            None,\n","        )\n","\n","        if found_index is not None:\n","            df.at[index, 'paper_index'] = found_index\n","\n","# Save the updated DataFrame back to the same Excel file\n","df.to_excel(file_path, index=False)\n","\n","print(f\"Excel file '{file_path}' has been updated.\")\n"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"appJc0hN7j6a"},"outputs":[],"source":["import os\n","import pandas as pd\n","import re\n","from bs4 import BeautifulSoup\n","\n","# Define file paths\n","# NOTE(review): these HIPORank paths look like a leftover from another project;\n","# the next cell redefines them for PACSUM.\n","excel_file_path = r'/content/drive/MyDrive/Extractive_summarization/HIPORank/updated_file.xlsx'\n","base_dir_path = r'/content/drive/MyDrive/Extractive_summarization/scisummnet_final_dataset/top1000_complete'\n","output_dir_path = r'/content/drive/MyDrive/Extractive_summarization/HIPORank/dataset/inputs'\n","\n","# Read the Excel file\n","df = pd.read_excel(excel_file_path)"]},{"cell_type":"code","execution_count":18,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"0vmhjzPQ0vBH","outputId":"e0e1c9f6-b103-4030-e27a-10270b8ebeca","executionInfo":{"status":"ok","timestamp":1719063256037,"user_tz":-240,"elapsed":1639,"user":{"displayName":"Aditi Paretkar","userId":"17466297872366651006"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["Excel file '/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/updated_file_pacsum.xlsx' has 
import pandas as pd

def remove_xml_extension(paper_index):
    """Strip one trailing '.xml' suffix (case-insensitive) from a paper index.

    Non-string values (e.g. NaN cells) pass through unchanged.
    """
    if isinstance(paper_index, str) and paper_index.lower().endswith('.xml'):
        return paper_index[:-4]  # drop the 4 characters of '.xml'
    return paper_index

if __name__ == '__main__':
    # Normalise the paper_index column of the PACSUM workbook in place.
    file_path = r'/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/updated_file_pacsum.xlsx'
    df = pd.read_excel(file_path)
    df['paper_index'] = df['paper_index'].apply(remove_xml_extension)
    df.to_excel(file_path, index=False)
    print(f"Excel file '{file_path}' has been updated.")

    # --- next cell: paths and workbook for building PACSUM input files ---
    import os
    import re
    from bs4 import BeautifulSoup  # Colab-runtime dependency

    excel_file_path = r'/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/updated_file_pacsum.xlsx'
    base_dir_path = r'/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/scisummnet_final_dataset/top1000_complete'
    output_dir_path = r'/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/dataset/inputs'

    df = pd.read_excel(excel_file_path)
import os
import glob
import pandas as pd

def extract_sentences_from_xml(xml_content):
    """Return introduction-like text from a SciSummNet paper XML.

    Prefers the sentences of the first <SECTION>.  When the document has
    no <SECTION> tags, falls back to whole-document sentences 5-19 (the
    first five are assumed to belong to the abstract).

    Fixes the original version: its first-section result was
    unconditionally overwritten by the fallback slice (dead code), its
    bare ``except`` could raise NameError on an unbound ``soup``, and it
    left debug prints in the output.
    """
    # BeautifulSoup is imported by the preceding notebook cell.
    soup = BeautifulSoup(xml_content, 'xml')
    sections = soup.find_all('SECTION')
    if sections:
        sentences = sections[0].find_all('S')
    else:
        # No sections: skip the assumed abstract, take the next 15 sentences.
        sentences = soup.find_all('S')[5:20]
    return ' '.join(sentence.get_text() for sentence in sentences)

def build_input_files(df, base_dir_path, output_dir_path):
    """Write one lower-cased input file per paper: intro text + summary.

    Rows with a missing/empty paper_index are skipped; per-paper failures
    are reported and do not abort the run.
    """
    for index, row in df.iterrows():
        paper_index = row['paper_index']
        if pd.isna(paper_index) or not paper_index:
            continue  # nothing to look up for this row
        paper_folder_path = os.path.join(base_dir_path, paper_index, 'Documents_xml')
        try:
            xml_files = [f for f in os.listdir(paper_folder_path) if f.endswith('.xml')]
            if not xml_files:
                raise FileNotFoundError(f"No XML files found in {paper_folder_path}")
            xml_file_path = os.path.join(paper_folder_path, xml_files[0])
            with open(xml_file_path, 'r', encoding='utf-8') as file:
                xml_content = file.read()
            concatenated_sentences = extract_sentences_from_xml(xml_content)
            summary_with_sentences = concatenated_sentences + row['summary_text']
            output_file_path = os.path.join(output_dir_path, f"{paper_index}.txt")
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write(summary_with_sentences.lower())
        except Exception as e:
            print(f"Error processing {paper_index}: {e}")
    print("Text files have been saved to the output directory.")

def build_target_files(source_base_dir, target_base_dir):
    """Copy every <paper>/summary/*.txt into ``target_base_dir``,
    lower-cased and with whitespace collapsed to single spaces."""
    os.makedirs(target_base_dir, exist_ok=True)
    for folder_name in os.listdir(source_base_dir):
        summary_folder_path = os.path.join(source_base_dir, folder_name, 'summary')
        if not os.path.isdir(summary_folder_path):
            continue  # paper folder without a summary subfolder
        for summary_file in glob.glob(os.path.join(summary_folder_path, '*.txt')):
            with open(summary_file, 'r', encoding='utf-8') as file:
                content = file.read()
            processed_content = ' '.join(content.lower().split())
            target_file_path = os.path.join(target_base_dir, os.path.basename(summary_file))
            with open(target_file_path, 'w', encoding='utf-8') as file:
                file.write(processed_content)
    print("Processing completed.")

if __name__ == '__main__':
    # `df`, `base_dir_path`, `output_dir_path` come from the preceding cell.
    build_input_files(df, base_dir_path, output_dir_path)

    source_base_dir = "/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/scisummnet_final_dataset/top1000_complete"
    target_base_dir = "/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/dataset/targets"
    build_target_files(source_base_dir, target_base_dir)
xt\n","J97-1003.txt\n","H91-1060.txt\n","C04-1024.txt\n","D07-1103.txt\n","N09-1046.txt\n","P07-2045.txt\n","W97-0703.txt\n","W04-3208.txt\n","P05-1012.txt\n","P06-1103.txt\n","W03-1011.txt\n","P97-1003.txt\n","W07-0718.txt\n","J94-4002.txt\n","C04-1080.txt\n","W00-1308.txt\n","W02-1210.txt\n","W96-0208.txt\n","D09-1030.txt\n","P02-1019.txt\n","A97-1029.txt\n","J03-4004.txt\n","P02-1022.txt\n","P01-1030.txt\n","C88-2121.txt\n","P03-1056.txt\n","P87-1015.txt\n","S10-1010.txt\n","P06-1095.txt\n","P08-1004.txt\n","W02-1006.txt\n","W06-3812.txt\n","D07-1074.txt\n","D07-1080.txt\n","C88-2128.txt\n","P09-1039.txt\n","N09-1028.txt\n","W00-1201.txt\n","J96-1002.txt\n","C04-1111.txt\n","N03-1003.txt\n","W04-3103.txt\n","J08-4003.txt\n","W04-3205.txt\n","P04-1036.txt\n","P09-1094.txt\n","J00-4003.txt\n","D11-1062.txt\n","N06-1006.txt\n","J00-1004.txt\n","C94-2178.txt\n","J97-4005.txt\n","W03-0424.txt\n","J99-4004.txt\n","P07-1092.txt\n","N04-1023.txt\n","H94-1046.txt\n","J93-1003.txt\n","W97-0301.txt\n","P99-1067.txt\n","W98-0705.txt\n","N04-1025.txt\n","C04-1197.txt\n","W02-1021.txt\n","N13-1039.txt\n","P96-1027.txt\n","W99-0625.txt\n","P08-1028.txt\n","P03-1013.txt\n","N03-1016.txt\n","N04-1014.txt\n","P02-1033.txt\n","D07-1096.txt\n","D08-1020.txt\n","D08-1014.txt\n","P08-1036.txt\n","E06-1005.txt\n","C92-3150.txt\n","P94-1013.txt\n","I05-3027.txt\n","E06-1015.txt\n","W04-2406.txt\n","P07-1019.txt\n","C90-2067.txt\n","W97-0109.txt\n","W07-0734.txt\n","P03-1058.txt\n","C02-1011.txt\n","J03-1005.txt\n","P11-2031.txt\n","P03-1004.txt\n","W03-1730.txt\n","W10-0204.txt\n","P90-1010.txt\n","P91-1027.txt\n","W97-1306.txt\n","P04-1056.txt\n","W02-2016.txt\n","D10-1125.txt\n","H93-1061.txt\n","D08-1065.txt\n","P89-1031.txt\n","S10-1011.txt\n","N06-1025.txt\n","D07-1091.txt\n","P11-1060.txt\n","N10-1119.txt\n","P05-1073.txt\n","W11-2103.txt\n","P06-1077.txt\n","J04-1002.txt\n","W06-2933.txt\n","N04-1021.txt\n","D09-1001.txt\n","P97-1023.txt\n","P02-1043.txt\n","W98-1106.txt\n","P04-
1083.txt\n","W07-2002.txt\n","P11-1098.txt\n","P09-1113.txt\n","W97-0713.txt\n","J05-1003.txt\n","W04-3206.txt\n","P05-1036.txt\n","W97-0209.txt\n","C94-2195.txt\n","E06-1025.txt\n","C90-3030.txt\n","C90-3052.txt\n","D08-1035.txt\n","E06-1042.txt\n","P04-1035.txt\n","P06-4020.txt\n","N07-1051.txt\n","C04-1146.txt\n","H05-1045.txt\n","E06-1031.txt\n","P03-1002.txt\n","P99-1065.txt\n","P06-2014.txt\n","E06-1038.txt\n","N07-1029.txt\n","N04-1033.txt\n","H93-1052.txt\n","H05-1059.txt\n","E06-1032.txt\n","P06-1066.txt\n","P98-2204.txt\n","W02-1028.txt\n","C02-1114.txt\n","E99-1023.txt\n","P03-1001.txt\n","P08-1076.txt\n","W07-0733.txt\n","J96-3004.txt\n","P99-1048.txt\n","W04-2401.txt\n","N12-1052.txt\n","P85-1018.txt\n","P07-1056.txt\n","P99-1068.txt\n","P03-1021.txt\n","J02-3001.txt\n","J82-3004.txt\n","P96-1041.txt\n","W11-1901.txt\n","A00-2034.txt\n","P00-1016.txt\n","P93-1003.txt\n","A94-1006.txt\n","N12-1067.txt\n","P07-1125.txt\n","W04-2705.txt\n","J97-3002.txt\n","W08-1301.txt\n","P86-1031.txt\n","P09-1011.txt\n","P06-1121.txt\n","P10-1146.txt\n","N06-1003.txt\n","P88-1020.txt\n","W00-0717.txt\n","C92-3126.txt\n","W04-3212.txt\n","D07-1072.txt\n","P94-1012.txt\n","P92-1005.txt\n","J90-1004.txt\n","J92-4003.txt\n","P97-1063.txt\n","P02-1035.txt\n","M95-1012.txt\n","E03-1076.txt\n","P92-1032.txt\n","P01-1064.txt\n","J88-1003.txt\n","D09-1086.txt\n","P98-1013.txt\n","P02-1053.txt\n","J02-1003.txt\n","N06-1041.txt\n","D09-1092.txt\n","W09-1304.txt\n","N01-1011.txt\n","E03-1009.txt\n","P08-1088.txt\n","N03-1030.txt\n","N01-1026.txt\n","N06-2015.txt\n","W04-2407.txt\n","W04-3219.txt\n","J05-1004.txt\n","W02-1503.txt\n","N07-4013.txt\n","P06-1067.txt\n","A94-1009.txt\n","J92-1004.txt\n","W07-2018.txt\n","N07-1047.txt\n","D08-1076.txt\n","P02-1040.txt\n","W02-1001.txt\n","C96-1055.txt\n","N04-1035.txt\n","W09-1105.txt\n","N12-1047.txt\n","P03-1054.txt\n","W97-0322.txt\n","J93-1002.txt\n","W10-2805.txt\n","P05-1052.txt\n","P93-1005.txt\n","N09-1009.txt\n","P04-1054.txt\n"
,"W05-1203.txt\n","P91-1017.txt\n","P99-1032.txt\n","P83-1019.txt\n","J98-1001.txt\n","N06-1020.txt\n","P05-1011.txt\n","P03-1003.txt\n","C08-1018.txt\n","P05-1047.txt\n","W97-0313.txt\n","N06-1014.txt\n","P08-1102.txt\n","P99-1042.txt\n","C08-1098.txt\n","J99-3001.txt\n","E89-1009.txt\n","P00-1056.txt\n","P03-1044.txt\n","D08-1089.txt\n","W09-0424.txt\n","P06-1104.txt\n","A97-1004.txt\n","P09-1042.txt\n","W06-1639.txt\n","W04-1013.txt\n","P08-1084.txt\n","N01-1023.txt\n","W04-2319.txt\n","J03-3005.txt\n","P00-1010.txt\n","P93-1023.txt\n","P02-1038.txt\n","J09-3003.txt\n","J98-1006.txt\n","P03-1019.txt\n","J88-2003.txt\n","P98-1035.txt\n","W03-1017.txt\n","P06-3002.txt\n","N06-1039.txt\n","D10-1124.txt\n","D11-1141.txt\n","P06-1055.txt\n","P07-1065.txt\n","J01-3003.txt\n","P02-1018.txt\n","W95-0101.txt\n","C94-1027.txt\n","W99-0612.txt\n","W11-1801.txt\n","P96-1025.txt\n","W04-0811.txt\n","N04-4038.txt\n","N07-1023.txt\n","N10-1063.txt\n","P08-1108.txt\n","P83-1021.txt\n","P06-1109.txt\n","J96-1001.txt\n","P07-1034.txt\n","P00-1071.txt\n","P02-1051.txt\n","W01-1313.txt\n","N06-1056.txt\n","P05-1022.txt\n","C10-1011.txt\n","P04-1018.txt\n","D09-1005.txt\n","D09-1058.txt\n","P04-1021.txt\n","P06-1091.txt\n","L08-1093.txt\n","P06-1032.txt\n","P11-1061.txt\n","D11-1006.txt\n","E06-1027.txt\n","J98-2001.txt\n","W11-0705.txt\n","P07-1091.txt\n","W98-1115.txt\n","P08-1086.txt\n","P98-1046.txt\n","D07-1111.txt\n","P07-1030.txt\n","W03-1006.txt\n","J06-1003.txt\n","D07-1071.txt\n","P06-2101.txt\n","W07-0403.txt\n","J98-2004.txt\n","D07-1002.txt\n","J02-1002.txt\n","P04-1005.txt\n","P07-1096.txt\n","P07-1028.txt\n","N03-1014.txt\n","N03-1020.txt\n","W06-3808.txt\n","P93-1022.txt\n","D10-1044.txt\n","W08-2123.txt\n","J94-4003.txt\n","P09-1116.txt\n","P06-1043.txt\n","C00-1007.txt\n","W96-0102.txt\n","A92-1021.txt\n","W02-1018.txt\n","W94-0319.txt\n","J98-3005.txt\n","N03-2021.txt\n","P04-1085.txt\n","J94-2003.txt\n","E06-1040.txt\n","N04-1041.txt\n","N13-1090.txt\n","P07-1036.
txt\n","A00-2026.txt\n","P08-1064.txt\n","D10-1048.txt\n","W04-3252.txt\n","C94-2174.txt\n","P97-1009.txt\n","P96-1038.txt\n","W01-1605.txt\n","P94-1002.txt\n","P97-1005.txt\n","P09-1027.txt\n","N01-1016.txt\n","W03-0419.txt\n","P93-1035.txt\n","H05-1079.txt\n","C02-1054.txt\n","J87-1005.txt\n","W95-0105.txt\n","P97-1013.txt\n","J06-3003.txt\n","J07-4004.txt\n","P02-1042.txt\n","J97-1002.txt\n","W02-1502.txt\n","J95-4004.txt\n","W03-1008.txt\n","P10-4002.txt\n","P07-1059.txt\n","N09-1025.txt\n","N01-1025.txt\n","W04-3237.txt\n","J98-2002.txt\n","C94-1032.txt\n","W02-1011.txt\n","P86-1004.txt\n","P09-1026.txt\n","J03-1002.txt\n","P02-1006.txt\n","W07-2009.txt\n","J00-3003.txt\n","W95-0107.txt\n","C96-1021.txt\n","H94-1048.txt\n","P05-1066.txt\n","P02-1039.txt\n","A00-2018.txt\n","C90-3045.txt\n","P10-1001.txt\n","W10-2903.txt\n","P98-1029.txt\n","W05-0904.txt\n","D11-1142.txt\n","W05-0625.txt\n","P99-1016.txt\n","W99-0611.txt\n","D08-1036.txt\n","D12-1050.txt\n","W03-0407.txt\n","C86-1045.txt\n","W03-1014.txt\n","P00-1027.txt\n","J95-2002.txt\n","P98-2143.txt\n","N03-1026.txt\n","W07-2006.txt\n","N01-1008.txt\n","W96-0213.txt\n","C08-1114.txt\n","S12-1053.txt\n","C08-1107.txt\n","A00-1043.txt\n","P02-1047.txt\n","P02-1017.txt\n","P87-1022.txt\n","P05-1013.txt\n","J10-3003.txt\n","H01-1035.txt\n","W07-2216.txt\n","C04-1041.txt\n","J94-4001.txt\n","P04-1066.txt\n","P09-2004.txt\n","C90-3063.txt\n","C04-1010.txt\n","J91-1003.txt\n","P95-1021.txt\n","C92-1038.txt\n","W09-0401.txt\n","D08-1021.txt\n","J92-1001.txt\n","J98-4004.txt\n","J99-4005.txt\n","P03-1051.txt\n","D10-1119.txt\n","W01-0511.txt\n","J07-3004.txt\n","P07-1049.txt\n","D08-1011.txt\n","C96-1079.txt\n","W06-1615.txt\n","W01-0501.txt\n","D12-1133.txt\n","D09-1098.txt\n","P93-1002.txt\n","D11-1129.txt\n","W06-3108.txt\n","N06-1011.txt\n","P10-1052.txt\n","P04-1061.txt\n","N07-1071.txt\n","W07-0702.txt\n","P93-1020.txt\n","W02-2018.txt\n","P06-2094.txt\n","W00-1401.txt\n","W04-3253.txt\n","N04-1001.txt\n","C88
-2147.txt\n","W06-1203.txt\n","W04-3236.txt\n","P09-1104.txt\n","I08-1059.txt\n","D07-1031.txt\n","N04-3012.txt\n","N09-1041.txt\n","W06-3114.txt\n","I05-2038.txt\n","H92-1045.txt\n","W99-0604.txt\n","C02-1145.txt\n","J93-2002.txt\n","N03-1021.txt\n","W06-2920.txt\n","W11-2123.txt\n","D10-1120.txt\n","H05-1043.txt\n","E99-1001.txt\n","P06-1115.txt\n","W01-0513.txt\n","P02-1050.txt\n","P06-1038.txt\n","W02-2024.txt\n","P03-1011.txt\n","P10-1044.txt\n","A94-1016.txt\n","P05-1017.txt\n","N01-1020.txt\n","N03-1033.txt\n","P10-1040.txt\n","P98-1010.txt\n","P08-1114.txt\n","P05-1072.txt\n","J87-1004.txt\n","N03-2002.txt\n","W11-2107.txt\n","N10-1013.txt\n","W00-0726.txt\n","W97-0302.txt\n","P05-1010.txt\n","P83-1007.txt\n","P06-1014.txt\n","P08-1024.txt\n","P96-1008.txt\n","C92-1019.txt\n","P08-1023.txt\n","P08-1085.txt\n","P01-1025.txt\n","A92-1018.txt\n","J93-1006.txt\n","E06-1011.txt\n","J90-2002.txt\n","P00-1065.txt\n","H05-1011.txt\n","H05-1066.txt\n","W95-0115.txt\n","W12-3102.txt\n","H05-1091.txt\n","P02-1014.txt\n","W04-0308.txt\n","C88-1016.txt\n","P04-1075.txt\n","N04-1042.txt\n","P11-2008.txt\n","W03-1728.txt\n","P09-1068.txt\n","P95-1007.txt\n","P93-1024.txt\n","I05-3025.txt\n","J02-4002.txt\n","P09-1010.txt\n","D08-1092.txt\n","P07-1098.txt\n","W04-3230.txt\n","W03-0430.txt\n","N04-4015.txt\n","W99-0501.txt\n","J98-4003.txt\n","J96-2004.txt\n","W06-2922.txt\n","P05-1015.txt\n","J03-4003.txt\n","P09-2012.txt\n","J01-2001.txt\n","W97-0119.txt\n","J94-4004.txt\n","P92-1008.txt\n","I05-3017.txt\n","P00-1058.txt\n","W00-1427.txt\n","N04-1015.txt\n","P07-1107.txt\n","J93-1007.txt\n","P03-1023.txt\n","P90-1032.txt\n","W99-0623.txt\n","P11-1038.txt\n","P08-1066.txt\n","P06-1084.txt\n","P07-1003.txt\n","W03-1809.txt\n","A00-2009.txt\n","C08-1022.txt\n","P95-1037.txt\n","W08-0309.txt\n","D08-1024.txt\n","P07-1040.txt\n","W00-0712.txt\n","N03-1028.txt\n","J07-2003.txt\n","C04-1200.txt\n","P03-1009.txt\n","P07-1037.txt\n","W00-0403.txt\n","J94-3001.txt\n","H05-1010.txt\n
","D09-1101.txt\n","W07-0717.txt\n","W98-1118.txt\n","W99-0613.txt\n","P06-1134.txt\n","J08-1001.txt\n","P08-1030.txt\n","J08-2005.txt\n","C00-1044.txt\n","W06-2501.txt\n","W07-2014.txt\n","P98-1034.txt\n","H92-1026.txt\n","W02-0301.txt\n","J94-2001.txt\n","P96-1006.txt\n","P08-1043.txt\n","P05-1059.txt\n","P97-1041.txt\n","P97-1035.txt\n","P93-1001.txt\n","J00-3004.txt\n","P87-1033.txt\n","J92-4007.txt\n","J93-2006.txt\n","P90-1034.txt\n","P12-1092.txt\n","W02-0817.txt\n","N04-1013.txt\n","P07-1121.txt\n","P05-1034.txt\n","D08-1022.txt\n","H05-1021.txt\n","P07-1031.txt\n","P99-1071.txt\n","W00-0730.txt\n","D09-1026.txt\n","P13-1045.txt\n","W09-0432.txt\n","J08-4004.txt\n","P03-1035.txt\n","W06-1606.txt\n","P08-1090.txt\n","D08-1016.txt\n","W04-1221.txt\n","P06-1097.txt\n","N03-1024.txt\n","C96-2141.txt\n","W10-0701.txt\n","C92-2070.txt\n","P94-1020.txt\n","J86-3001.txt\n","J04-3002.txt\n","P04-1014.txt\n","J93-2003.txt\n","J95-2003.txt\n","D10-1001.txt\n","N03-1022.txt\n","J81-4003.txt\n","J02-2003.txt\n","W03-0501.txt\n","P99-1059.txt\n","N06-2033.txt\n","W06-1642.txt\n","P95-1034.txt\n","J00-2004.txt\n","P96-1011.txt\n","P01-1005.txt\n","P91-1034.txt\n","W06-1651.txt\n","W04-0803.txt\n","P00-1041.txt\n","P88-1015.txt\n","P02-1001.txt\n","H05-2018.txt\n","D11-1014.txt\n","W02-0902.txt\n","A00-1031.txt\n","D07-1109.txt\n","P06-1005.txt\n","C96-2183.txt\n","D07-1090.txt\n","W02-0908.txt\n","P94-1019.txt\n","D07-1104.txt\n","P98-1106.txt\n","J93-2005.txt\n","W06-1670.txt\n","P07-1055.txt\n","W01-0514.txt\n","C10-2028.txt\n","P08-2026.txt\n","W04-3111.txt\n","D09-1120.txt\n","P03-2026.txt\n","P05-1053.txt\n","W06-3105.txt\n","W08-0509.txt\n","W07-2016.txt\n","D07-1043.txt\n","P99-1014.txt\n","D09-1127.txt\n","M95-1005.txt\n","P04-1053.txt\n","A97-1030.txt\n","P08-1119.txt\n","W04-3247.txt\n","P09-1074.txt\n","W11-1902.txt\n","E09-1005.txt\n","P98-2173.txt\n","N09-1037.txt\n","P09-1019.txt\n","W06-1607.txt\n","P98-2180.txt\n","P08-1109.txt\n","W06-0301.txt\n","A97-1014
.txt\n","C04-1180.txt\n","P97-1017.txt\n","W98-1119.txt\n","P89-1002.txt\n","P02-1062.txt\n","D07-1007.txt\n","D08-1027.txt\n","P98-1112.txt\n","W08-2102.txt\n","C02-2025.txt\n","P93-1008.txt\n","D07-1097.txt\n","C04-1072.txt\n","J97-3003.txt\n","P07-1094.txt\n","P05-1067.txt\n","J01-3001.txt\n","P90-1005.txt\n","P01-1019.txt\n","C92-1025.txt\n","W03-1810.txt\n","P06-2005.txt\n","P96-1024.txt\n","P05-1071.txt\n","J99-1003.txt\n","J04-4002.txt\n","P98-1012.txt\n","W01-0521.txt\n","J93-1001.txt\n","P02-1046.txt\n","N10-1020.txt\n","P84-1075.txt\n","P92-1017.txt\n","P11-1019.txt\n","P07-1106.txt\n","P05-1077.txt\n","W10-1703.txt\n","P99-1004.txt\n","J90-1003.txt\n","N09-2004.txt\n","W04-3239.txt\n","H05-1004.txt\n","J01-2004.txt\n","P85-1011.txt\n","J80-3003.txt\n","P93-1041.txt\n","N04-1022.txt\n","C92-2066.txt\n","P09-1088.txt\n","P04-1043.txt\n","W03-0301.txt\n","A92-1006.txt\n","P05-1045.txt\n","P05-1033.txt\n","H05-1053.txt\n","D08-1031.txt\n","P06-1004.txt\n","D08-1083.txt\n","P03-1012.txt\n","P01-1067.txt\n","E99-1010.txt\n","J08-2002.txt\n","W99-0629.txt\n","C08-1109.txt\n","C10-1152.txt\n","P04-1041.txt\n","W04-3207.txt\n","W08-0336.txt\n","P11-2033.txt\n","W02-0109.txt\n","C00-1072.txt\n","P05-1020.txt\n","J01-2002.txt\n","D07-1101.txt\n","J04-1005.txt\n","N10-1056.txt\n","W98-1411.txt\n","C02-1139.txt\n","P09-1058.txt\n","C10-2005.txt\n","P08-1115.txt\n","P10-1142.txt\n","D08-1068.txt\n","P08-1068.txt\n","J93-2004.txt\n","P05-1044.txt\n","P00-1037.txt\n","W03-1812.txt\n","W05-0602.txt\n","N10-1061.txt\n","W09-1401.txt\n","C92-2082.txt\n","N04-1030.txt\n","P08-1101.txt\n","N10-1115.txt\n","D08-1082.txt\n","P03-1071.txt\n","P06-1114.txt\n","P07-1073.txt\n","C04-1051.txt\n","N01-1024.txt\n","D07-1061.txt\n","P05-1065.txt\n","W07-1401.txt\n","P07-1005.txt\n","P08-1012.txt\n","W06-2915.txt\n","N09-1012.txt\n","W03-0404.txt\n","P06-1010.txt\n","W05-0909.txt\n","P11-1138.txt\n","E87-1002.txt\n","A00-2030.txt\n","C04-1059.txt\n","P07-1007.txt\n","W09-1119.txt\n","C0
0-2137.txt\n","P06-1015.txt\n","N01-1006.txt\n","P91-1030.txt\n","W02-1039.txt\n","C96-1058.txt\n","H91-1026.txt\n","J05-3002.txt\n","P02-1060.txt\n","P98-2182.txt\n","P06-1123.txt\n","P06-2066.txt\n","W07-1604.txt\n","P04-3022.txt\n","W07-2012.txt\n","P06-1072.txt\n","H05-1044.txt\n","W04-3201.txt\n","P05-3026.txt\n","A00-2024.txt\n","D07-1114.txt\n","A88-1019.txt\n","J03-1003.txt\n","P99-1008.txt\n","C94-1042.txt\n","E03-1008.txt\n","P98-2177.txt\n","A97-1039.txt\n","J04-4004.txt\n","N04-1019.txt\n","N07-1018.txt\n","N09-1003.txt\n","W03-1508.txt\n","N04-1016.txt\n","W02-2026.txt\n","P11-1016.txt\n","P99-1041.txt\n","D11-1033.txt\n","P09-1057.txt\n","P11-1055.txt\n","P06-1009.txt\n","W02-0603.txt\n","J97-1005.txt\n","W95-0104.txt\n","D07-1076.txt\n","P07-1032.txt\n","P09-1040.txt\n","P10-1110.txt\n","P95-1026.txt\n","P05-2008.txt\n","P06-1124.txt\n","D09-1159.txt\n","C04-1046.txt\n","J01-4004.txt\n","P89-1009.txt\n","P03-1022.txt\n","W97-0802.txt\n","C02-1144.txt\n","J00-4005.txt\n","W03-0425.txt\n","C90-3044.txt\n","P05-1001.txt\n","N06-2013.txt\n","A00-2031.txt\n","E03-1071.txt\n","P83-1020.txt\n","N06-1033.txt\n","W11-1802.txt\n","P84-1018.txt\n","A97-1011.txt\n","P96-1021.txt\n","J91-1002.txt\n","C04-1081.txt\n","P06-1101.txt\n","W05-1506.txt\n","P03-1010.txt\n","C02-1150.txt\n","P05-1018.txt\n","E89-1037.txt\n","P04-1013.txt\n","D10-1115.txt\n","W03-1028.txt\n","E06-1043.txt\n","N04-1043.txt\n","W03-0428.txt\n","J04-2003.txt\n","P04-1077.txt\n","W04-0807.txt\n","P91-1022.txt\n","P93-1016.txt\n","W06-1616.txt\n","E06-1051.txt\n","J03-3002.txt\n","N07-1030.txt\n","P06-1085.txt\n","P95-1050.txt\n","A97-1052.txt\n","N04-4026.txt\n","N07-1038.txt\n","P98-2127.txt\n","W93-0301.txt\n","P05-1057.txt\n","W96-0214.txt\n","J93-3003.txt\n","P01-1008.txt\n","P09-1077.txt\n","D11-1125.txt\n","N10-1019.txt\n","P01-1017.txt\n","W06-3601.txt\n","P93-1032.txt\n","E09-1013.txt\n","W00-1303.txt\n","P06-1011.txt\n","J03-3001.txt\n","W06-2932.txt\n","W03-0405.txt\n","D07-1003.txt\
import os

def strip_gold_markers(target_base_dir="/content/drive/MyDrive/RA_Internship/PACSUM/DATASET_PACSUM/dataset/targets"):
    """Rename files in ``target_base_dir`` by removing every '.gold' substring.

    E.g. 'A00-1031.gold.txt' -> 'A00-1031.txt'.  Prints each inspected
    filename and a completion message, matching the original cell's output.
    The directory is now a parameter (defaulting to the original hard-coded
    Drive path) so the helper is reusable and testable.
    """
    for filename in os.listdir(target_base_dir):
        print(filename)
        if '.gold' in filename:
            new_filename = filename.replace('.gold', '')
            old_file_path = os.path.join(target_base_dir, filename)
            new_file_path = os.path.join(target_base_dir, new_filename)
            os.rename(old_file_path, new_file_path)
    print("Renaming completed.")

if __name__ == '__main__':
    strip_gold_markers()
+It achieves the following results on the evaluation set: +- Loss: 2.5461 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-05 +- train_batch_size: 2 +- eval_batch_size: 2 +- seed: 42 +- gradient_accumulation_steps: 4 +- total_train_batch_size: 8 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: linear +- num_epochs: 5 +- mixed_precision_training: Native AMP + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:-----:|:----:|:---------------:| +| 2.8648 | 0.1 | 10 | 2.8816 | +| 2.9889 | 0.2 | 20 | 2.7866 | +| 3.0516 | 0.3 | 30 | 2.7394 | +| 2.6605 | 0.4 | 40 | 2.7132 | +| 2.8093 | 0.5 | 50 | 2.6759 | +| 2.9206 | 0.6 | 60 | 2.6607 | +| 2.8094 | 0.7 | 70 | 2.6576 | +| 2.5233 | 0.8 | 80 | 2.6327 | +| 2.6508 | 0.9 | 90 | 2.6117 | +| 2.8456 | 1.0 | 100 | 2.5861 | +| 2.4622 | 1.1 | 110 | 2.5942 | +| 2.2871 | 1.2 | 120 | 2.5751 | +| 2.4482 | 1.3 | 130 | 2.5776 | +| 2.4079 | 1.4 | 140 | 2.5777 | +| 2.2842 | 1.5 | 150 | 2.5621 | +| 2.6267 | 1.6 | 160 | 2.5463 | +| 2.3895 | 1.7 | 170 | 2.5503 | +| 2.2786 | 1.8 | 180 | 2.5470 | +| 2.3628 | 1.9 | 190 | 2.5420 | +| 2.2809 | 2.0 | 200 | 2.5367 | +| 2.2726 | 2.1 | 210 | 2.5405 | +| 2.1934 | 2.2 | 220 | 2.5676 | +| 2.2447 | 2.3 | 230 | 2.5399 | +| 2.4508 | 2.4 | 240 | 2.5435 | +| 2.2969 | 2.5 | 250 | 2.5490 | +| 2.4206 | 2.6 | 260 | 2.5317 | +| 2.0131 | 2.7 | 270 | 2.5378 | +| 2.0025 | 2.8 | 280 | 2.5492 | +| 2.2179 | 2.9 | 290 | 2.5280 | +| 2.2082 | 3.0 | 300 | 2.5190 | +| 1.9491 | 3.1 | 310 | 2.5608 | +| 2.291 | 3.2 | 320 | 2.5448 | +| 2.0431 | 3.3 | 330 | 2.5319 | +| 2.0671 | 3.4 | 340 | 2.5529 | +| 2.1939 | 3.5 | 350 | 2.5388 | +| 2.0606 | 3.6 | 360 | 2.5306 | +| 2.0088 | 3.7 | 370 | 2.5557 | +| 
2.1919 | 3.8 | 380 | 2.5317 | +| 2.2516 | 3.9 | 390 | 2.5290 | +| 1.9401 | 4.0 | 400 | 2.5404 | +| 2.1101 | 4.1 | 410 | 2.5354 | +| 1.8906 | 4.2 | 420 | 2.5520 | +| 1.9808 | 4.3 | 430 | 2.5488 | +| 1.8195 | 4.4 | 440 | 2.5496 | +| 1.8512 | 4.5 | 450 | 2.5535 | +| 2.0464 | 4.6 | 460 | 2.5519 | +| 2.0176 | 4.7 | 470 | 2.5450 | +| 2.0686 | 4.8 | 480 | 2.5460 | +| 2.0267 | 4.9 | 490 | 2.5463 | +| 1.8617 | 5.0 | 500 | 2.5461 | + + +### Framework versions + +- Transformers 4.41.2 +- Pytorch 2.3.0+cu121 +- Datasets 2.20.0 +- Tokenizers 0.19.1 diff --git a/DATASET_PACSUM/config.json b/DATASET_PACSUM/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5d0ce183dddc8769c4e51fd23d8ea605c44f37d6 --- /dev/null +++ b/DATASET_PACSUM/config.json @@ -0,0 +1,59 @@ +{ + "_name_or_path": "allenai/led-base-16384", + "activation_dropout": 0.0, + "activation_function": "gelu", + "architectures": [ + "LEDForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_window": [ + 1024, + 1024, + 1024, + 1024, + 1024, + 1024 + ], + "bos_token_id": 0, + "classif_dropout": 0.0, + "classifier_dropout": 0.0, + "d_model": 768, + "decoder_attention_heads": 12, + "decoder_ffn_dim": 3072, + "decoder_layerdrop": 0.0, + "decoder_layers": 6, + "decoder_start_token_id": 2, + "dropout": 0.1, + "early_stopping": true, + "encoder_attention_heads": 12, + "encoder_ffn_dim": 3072, + "encoder_layerdrop": 0.0, + "encoder_layers": 6, + "eos_token_id": 2, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1", + "2": "LABEL_2" + }, + "init_std": 0.02, + "is_encoder_decoder": true, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2 + }, + "length_penalty": 2.0, + "max_decoder_position_embeddings": 1024, + "max_encoder_position_embeddings": 16384, + "max_length": 512, + "min_length": 100, + "model_type": "led", + "no_repeat_ngram_size": 3, + "num_beams": 2, + "num_hidden_layers": 6, + "pad_token_id": 1, + "torch_dtype": "float32", + "transformers_version": "4.41.2", + 
"use_cache": false, + "vocab_size": 50265 +} diff --git a/DATASET_PACSUM/dataset/inputs/A00-1031.txt b/DATASET_PACSUM/dataset/inputs/A00-1031.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cb6cb17759a2ce42110f0acfbf5cce80d1b2ced --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A00-1031.txt @@ -0,0 +1 @@ +a large number of current language processing systems use a part-of-speech tagger for pre-processing. the tagger assigns a (unique or ambiguous) part-ofspeech tag to each token in the input and passes its output to the next processing level, usually a parser. furthermore, there is a large interest in part-ofspeech tagging for corpus annotation projects, who create valuable linguistic resources by a combination of automatic processing and human correction. for both applications, a tagger with the highest possible accuracy is required. the debate about which paradigm solves the part-of-speech tagging problem best is not finished. recent comparisons of approaches that can be trained on corpora (van halteren et al., 1998; volk and schneider, 1998) have shown that in most cases statistical aproaches (cutting et al., 1992; schmid, 1995; ratnaparkhi, 1996) yield better results than finite-state, rule-based, or memory-based taggers (brill, 1993; daelemans et al., 1996). they are only surpassed by combinations of different systems, forming a "voting tagger". among the statistical approaches, the maximum entropy framework has a very strong position. nevertheless, a recent independent comparison of 7 taggers (zavrel and daelemans, 1999) has shown that another approach even works better: markov models combined with a good smoothing technique and with handling of unknown words. this tagger, tnt, not only yielded the highest accuracy, it also was the fastest both in training and tagging. the tagger comparison was organized as a "blackbox test": set the same task to every tagger and compare the outcomes. 
this paper describes the models and techniques used by tnt together with the implementation. the reader will be surprised how simple the underlying model is. the result of the tagger comparison seems to support the maxime "the simplest is the best". however, in this paper we clarify a number of details that are omitted in major previous publications concerning tagging with markov models. as two examples, (rabiner, 1989) and (charniak et al., 1993) give good overviews of the techniques and equations used for markov models and part-ofspeech tagging, but they are not very explicit in the details that are needed for their application. we argue that it is not only the choice of the general model that determines the result of the tagger but also the various "small" decisions on alternatives. the aim of this paper is to give a detailed account of the techniques used in tnt. additionally, we present results of the tagger on the negra corpus (brants et al., 1999) and the penn treebank (marcus et al., 1993). the penn treebank results reported here for the markov model approach are at least equivalent to those reported for the maximum entropy approach in (ratnaparkhi, 1996). for a comparison to other taggers, the reader is referred to (zavrel and daelemans, 1999).tnt is freely available to universities and related organizations for research purposes (see http://www.coli.uni-sb.derthorstenant). a large number of current language processing systems use a part-of-speech tagger for pre-processing. for a comparison to other taggers, the reader is referred to (zavrel and daelemans, 1999). we have shown that a tagger based on markov models yields state-of-the-art results, despite contrary claims found in the literature. the penn treebank results reported here for the markov model approach are at least equivalent to those reported for the maximum entropy approach in (ratnaparkhi, 1996). 
the tagger assigns a (unique or ambiguous) part-ofspeech tag to each token in the input and passes its output to the next processing level, usually a parser. furthermore, there is a large interest in part-ofspeech tagging for corpus annotation projects, who create valuable linguistic resources by a combination of automatic processing and human correction. additionally, we present results of the tagger on the negra corpus (brants et al., 1999) and the penn treebank (marcus et al., 1993). for example, the markov model tagger used in the comparison of (van halteren et al., 1998) yielded worse results than all other taggers. it is a very interesting future research topic to determine the advantages of either of these approaches, to find the reason for their high accuracies, and to find a good combination of both. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A00-1043.txt b/DATASET_PACSUM/dataset/inputs/A00-1043.txt new file mode 100644 index 0000000000000000000000000000000000000000..c335466e44fe1721574d30d6178c9ce492bec2d6 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A00-1043.txt @@ -0,0 +1 @@ +current automatic summarizers usually rely on sentence extraction to produce summaries. human professionals also often reuse the input documents to generate summaries; however, rather than simply extracting sentences and stringing them together, as most current summarizers do, humans often "edit" the extracted sentences in some way so that the resulting summary is concise and coherent. 
we analyzed a set of articles and identified six major operations that can be used for editing the extracted sentences, including removing extraneous phrases from an extracted sentence, combining a reduced sentence with other sentences, syntactic transformation, substituting phrases in an extracted sentence with their paraphrases, substituting phrases with more general or specific descriptions, and reordering the extracted sentences (jing and mckeown, 1999; jing and mckeown, 2000). we call the operation of removing extraneous phrases from an extracted sentence sentence reduction. it is one of the most effective operations that can be used to edit the extracted sentences. reduction can remove material at any granularity: a word, a prepositional phrase, a gerund, a to-infinitive or a clause. we use the term "phrase" here to refer to any of the above components that can be removed in reduction. the following example shows an original sentence and its reduced form written by a human professional: original sentence: when it arrives sometime next year in new tv sets, the v-chip will give parents a new and potentially revolutionary device to block out programs they don't want their children to see. reduced sentence by humans: the v-chip will give parents a device to block out programs they don't want their children to see. we implemented an automatic sentence reduction system. input to the reduction system includes extracted sentences, as well as the original document. output of reduction are reduced forms of the extracted sentences, which can either be used to produce summaries directly, or be merged with other sentences. the reduction system uses multiple sources of knowledge to make reduction decisions, including syntactic knowledge, context, and statistics computed from a training corpus. we evaluated the system against the output of human professionals. 
the program achieved a success rate of 81.3%, meaning that 81.3% of reduction decisions made by the system agreed with those of humans. sentence reduction improves the conciseness of automatically generated summaries, making it concise and on target. it can also improve the coherence of generated summaries, since extraneous phrases that can potentially introduce incoherece are removed. we collected 500 sentences and their corresponding reduced forms written by humans, and found that humans reduced the length of these 500 sentences by 44.2% on average. this indicates that a good sentence reduction system can improve the conciseness of generated summaries significantly. in the next section, we describe the sentence reduction algorithm in details. in section 3, we introduce the evaluation scheme used to access the performance of the system and present evaluation results. in section 4, we discuss other applications of sentence reduction, the interaction between reduction and other modules in a summarization system, and related work on sentence simplication. finally, we the goal of sentence reduction is to "reduce without major loss"; that is, we want to remove as many extraneous phrases as possible from an extracted sentence so that it can be concise, but without detracting from the main idea the sentence conveys. ideally, we want to remove a phrase from an extracted sentence only if it is irrelevant to the main topic. to achieve this, the system relies on multiple sources of knowledge to make reduction decisions. we first introduce the resources in the system and then describe the reduction algorithm. (1) the corpus. one of the key features of the system is that it uses a corpus consisting of original sentences and their corresponding reduced forms written by humans for training and testing purpose. this corpus was created using an automatic program we have developed to automatically analyze human-written abstracts. 
the program, called the decomposition program, matches phrases in a human-written summary sentence to phrases in the original document (jing and mckeown, 1999). the human-written abstracts were collected from the free daily news service "communicationsrelated headlines", provided by the benton foundation (http://www.benton.org). the articles in the corpus are news reports on telecommunication related issues, but they cover a wide range of topics, such as law, labor, and company mergers. database to date. it provides lexical relations between words, including synonymy, antonymy, meronymy, entailment (e.g., eat —> chew), or causation (e.g., kill --* die). these lexical links are used to identify the focus in the local context. (4) the syntactic parser. we use the english slot grammar(esg) parser developed at ibm (mccord, 1990) to analyze the syntactic structure of an input sentence and produce a sentence parse tree. the esg parser not only annotates the syntactic category of a phrase (e.g., "np" or "vp"), it also annotates the thematic role of a phrase (e.g., "subject" or "object"). there are five steps in the reduction program: step 1: syntactic parsing. we first parse the input sentence using the esg parser and produce the sentence parse tree. the operations in all other steps are performed based on this parse tree. each following step annotates each node in the parse tree with additional information, such as syntactic or context importance, which are used later to determine which phrases (they are represented as subtrees in a parse tree) can be considered extraneous and thus removed. step 2: grammar checking. in this step, we determine which components of a sentence must not be deleted to keep the sentence grammatical. to do this, we traverse the parse tree produced in the first step in top-down order and mark, for each node in the parse tree, which of its children are grammatically obligatory. we use two sources of knowledge for this purpose. 
one source includes simple, linguistic-based rules that use the thematic role structure produced by the esg parser. for instance, for a sentence, the main verb, the subject, and the object(s) are essential if they exist, but a prepositional phrase is not; for a noun phrase, the head noun is essential, but an adjective modifier of the head noun is not. the other source we rely on is the large-scale lexicon we described earlier. the information in the lexicon is used to mark the obligatory arguments of verb phrases. for example, for the verb "convince", the lexicon has the following entry: this entry indicates that the verb "convince" can be followed by a noun phrase and a prepositional phrase starting with the preposition "of' (e.g., he convinced me of his innocence). it can also be followed by a noun phrase and a to-infinitive phrase (e.g., he convinced me to go to the party). this information prevents the system from deleting the "of" prepositional phrase or the to-infinitive that is part of the verb phrase. at the end of this step, each node in the parse tree — including both leaf nodes and intermediate nodes — is annotated with a value indicating whether it is grammatically obligatory. note that whether a node is obligatory is relative to its parent node only. for example, whether a determiner is obligatory is relative to the noun phrase it is in; whether a prepositional phrase is obligatory is relative to the sentence or the phrase it is in. step 3: context information. in this step, the system decides which components in the sentence are most related to the main topic being discussed. to measure the importance of a phrase in the local context, the system relies on lexical links between words. the hypothesis is that the more connected a word is with other words in the local context, the more likely it is to be the focus of the local context. 
we link the words in the extracted sentence with words in its local context, if they are repetitions, morphologically related, or linked in wordnet through one of the lexical relations. the system then computes an importance score for each word in the extracted sentence, based on the number of links it has with other words and the types of links. the formula for computing the context importance score for a word w is as follows: here, i represents the different types of lexical relations the system considered, including repetition, inflectional relation, derivational relation, and the lexical relations from wordnet. we assigned a weight to each type of lexical relation, represented by li in the formula. relations such as repetition or inflectional relation are considered more important and are assigned higher weights, while relations such as hypernym are considered less important and assigned lower weights. nu (w) in the formula represents the number of a particular type of lexical links the word w has with words in the local context. after an importance score is computed for each word, each phrase in the 'sentence gets a score by adding up the scores of its children nodes in the parse tree. this score indicates how important the phrase is in the local context. step 4: corpus evidence. the program uses a corpus consisting of sentences reduced by human professionals and their corresponding original sentences to compute how likely humans remove a certain phrase. the system first parsed the sentences in the corpus using esg parser. it then marked which subtrees in these parse trees (i.e., phrases in the sentences) were removed by humans. using this corpus of marked parse trees, we can compute how likely a subtree is removed from its parent node. 
for example, we can compute the probability that the "when" temporal clause is removed when the main verb is "give", represented as prob("when-clause is removed" i "v=give"), or the probability that the to-infinitive modifier of the head noun "device" is removed, represented as prob("to-infinitive modifier is removed" i"n=device"). these probabilities are computed using bayes's rule. for example, the probability that the "when" temporal clause is removed when the main verb is "give", prob("when-clause is removed" i "v=give"), is computed as the product of prob( "v=give" i "when-clause is removed") (i.e., the probability that the main verb is "give" when the "when" clause is removed) and prob("when-clause is removed") (i.e., the probability that the "when" clause is removed), divided by prob("v=give") (i.e., the probability that the main verb is "give"). besides computing the probability that a phrase is removed, we also compute two other types of probabilities: the probability that a phrase is reduced (i.e., the phrase is not removed as a whole, but some components in the phrase are removed), and the probability that a phrase is unchanged at all (i.e., neither removed nor reduced). these corpus probabilities help us capture human practice. for example, for sentences like "the agency reported that ..." , "the other source says that ..." , "the new study suggests that ..." , the thatclause following the say-verb (i.e., report, say, and suggest) in each sentence is very rarely changed at all by professionals. the system can capture this human practice, since the probability that that-clause of the verb say or report being unchanged at all will be relatively high, which will help the system to avoid removing components in the that-clause. these corpus probabilities are computed beforehand using a training corpus. they are then stored in a table and loaded at running time. step 5: final decision. 
the final reduction decisions are based on the results from all the earlier steps. to decide which phrases to remove, the system traverses the sentence parse tree, which now have been annotated with different types of information from earlier steps, in the top-down order and decides which subtrees should be removed, reduced or unchanged. a subtree (i.e., a phrase) is removed only if it is not grammatically obligatory, not the focus of the local context (indicated by a low importance score), and has a reasonable probability of being removed by humans. figure 1 shows sample output of the reduction program. the reduced sentences produced by humans are also provided for comparison.the reduced sentences produced by humans are also provided for comparison. current automatic summarizers usually rely on sentence extraction to produce summaries. figure 1 shows sample output of the reduction program. any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the national science foundation. it is one of the most effective operations that can be used to edit the extracted sentences. the final reduction decisions are based on the results from all the earlier steps. we call the operation of removing extraneous phrases from an extracted sentence sentence reduction. reduction can remove material at any granularity: a word, a prepositional phrase, a gerund, a to-infinitive or a clause. we analyzed a set of articles and identified six major operations that can be used for editing the extracted sentences, including removing extraneous phrases from an extracted sentence, combining a reduced sentence with other sentences, syntactic transformation, substituting phrases in an extracted sentence with their paraphrases, substituting phrases with more general or specific descriptions, and reordering the extracted sentences (jing and mckeown, 1999; jing and mckeown, 2000). step 5: final decision. 
they are then stored in a table and loaded at running time. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A00-2004.txt b/DATASET_PACSUM/dataset/inputs/A00-2004.txt new file mode 100644 index 0000000000000000000000000000000000000000..669fb86f4d9642216e656b85bbbb53728626ac7b --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A00-2004.txt @@ -0,0 +1 @@ +even moderately long documents typically address several topics or different aspects of the same topic. the aim of linear text segmentation is to discover the topic boundaries. the uses of this procedure include information retrieval (hearst and plaunt, 1993; hearst, 1994; yaari, 1997; reynar, 1999), summarization (reynar, 1998), text understanding, anaphora resolution (kozima, 1993), language modelling (morris and hirst, 1991; beeferman et al., 1997b) and improving document navigation for the visually disabled (choi, 2000). this paper focuses on domain independent methods for segmenting written text. we present a new algorithm that builds on previous work by reynar (reynar, 1998; reynar, 1994). the primary distinction of our method is the use of a ranking scheme and the cosine similarity measure (van rijsbergen, 1979) in formulating the similarity matrix. we propose that the similarity values of short text segments is statistically insignificant. thus, one can only rely on their order, or rank, for clustering.even moderately long documents typically address several topics or different aspects of the same topic. a segmentation algorithm has two key elements, a, clustering strategy and a similarity measure. we would also like to develop a linear time and multi-source version of the algorithm. thus, one can only rely on their order, or rank, for clustering. the significance of our results has been confirmed by both t-test and ks-test. the definition of a topic segment ranges from complete stories (allan et al., 1998) to summaries (ponte and croft, 1997). 
given the quality of an algorithm is task dependent, the following experiments focus on the relative performance. c99, k98 and r98 are all polynomial time algorithms. existing work falls into one of two categories, lexical cohesion methods and multi-source methods (yaari, 1997). it would be interesting to compare c99 with the multi-source method described in (beeferman et al., 1999) using the tdt corpus. if one disregards segmentation accuracy, h94 has the best algorithmic performance (linear). our evaluation strategy is a variant of that described in (reynar, 1998, 71-73) and the tdt segmentation task (allan et al., 1998). our results show divisive clustering (r98) is more precise than sliding window (h94) and lexical chains (k98) for locating topic boundaries. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A00-2009.txt b/DATASET_PACSUM/dataset/inputs/A00-2009.txt new file mode 100644 index 0000000000000000000000000000000000000000..92a6dbd149ea70ea9b671f8905515d8c551452d4 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A00-2009.txt @@ -0,0 +1 @@ +word sense disambiguation is often cast as a problem in supervised learning, where a disambiguator is induced from a corpus of manually sense—tagged text using methods from statistics or machine learning. these approaches typically represent the context in which each sense—tagged instance of a word occurs with a set of linguistically motivated features. a learning algorithm induces a representative model from these features which is employed as a classifier to perform disambiguation. this paper presents a corpus—based approach that results in high accuracy by combining a number of very simple classifiers into an ensemble that performs disambiguation via a majority vote. 
this is motivated by the observation that enhancing the feature set or learning algorithm used in a corpus—based approach does not usually improve disambiguation accuracy beyond what can be attained with shallow lexical features and a simple supervised learning algorithm. for example, a naive bayesian classifier (duda and hart, 1973) is based on a blanket assumption about the interactions among features in a sensetagged corpus and does not learn a representative model. despite making such an assumption, this proves to be among the most accurate techniques in comparative studies of corpus—based word sense disambiguation methodologies (e.g., (leacock et al., 1993), (mooney, 1996), (ng and lee, 1996), (pedersen and bruce, 1997)). these studies represent the context in which an ambiguous word occurs with a wide variety of features. however, when the contribution of each type of feature to overall accuracy is analyzed (eg. (ng and lee, 1996)), shallow lexical features such as co—occurrences and collocations prove to be stronger contributors to accuracy than do deeper, linguistically motivated features such as part—of—speech and verb—object relationships. it has also been shown that the combined accuracy of an ensemble of multiple classifiers is often significantly greater than that of any of the individual classifiers that make up the ensemble (e.g., (dietterich, 1997)). in natural language processing, ensemble techniques have been successfully applied to part— of—speech tagging (e.g., (brill and wu, 1998)) and parsing (e.g., (henderson and brill, 1999)). when combined with a history of disambiguation success using shallow lexical features and naive bayesian classifiers, these findings suggest that word sense disambiguation might best be improved by combining the output of a number of such classifiers into an ensemble. this paper begins with an introduction to the naive bayesian classifier. 
the features used to represent the context in which ambiguous words occur are presented, followed by the method for selecting the classifiers to include in the ensemble. then, the line and interesi data is described. experimental results disambiguating these words with an ensemble of naive bayesian classifiers are shown to rival previously published results. this paper closes with a discussion of the choices made in formulating this methodology and plans for future work.this work extends ideas that began in collaboration with rebecca bruce and janyce wiebe. a preliminary version of this paper appears in (pedersen, 2000). word sense disambiguation is often cast as a problem in supervised learning, where a disambiguator is induced from a corpus of manually sense—tagged text using methods from statistics or machine learning. this paper closes with a discussion of the choices made in formulating this methodology and plans for future work. each of the nine member classifiers votes for the most probable sense given the particular context represented by that classifier; the ensemble disambiguates by assigning the sense that receives a majority of the votes. a naive bayesian classifier assumes that all the feature variables representing a problem are conditionally independent given the value of a classification variable. these approaches typically represent the context in which each sense—tagged instance of a word occurs with a set of linguistically motivated features. this approach was evaluated using the widely studied nouns line and interest, which are disambiguated with accuracy of 88% and 89%, which rivals the best previously published results. experimental results disambiguating these words with an ensemble of naive bayesian classifiers are shown to rival previously published results. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A00-2018.txt b/DATASET_PACSUM/dataset/inputs/A00-2018.txt new file mode 100644 index 0000000000000000000000000000000000000000..720d45d603be548f99734ecfc6d31a53c136d7cd --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A00-2018.txt @@ -0,0 +1 @@ +we present a new parser for parsing down to penn tree-bank style parse trees [16] that achieves 90.1% average precision/recall for sentences of length < 40, and 89.5% for sentences of length < 100, when trained and tested on the previously established [5,9,10,15,17] "standard" sections of the wall street journal tree-bank. this represents a 13% decrease in error rate over the best single-parser results on this corpus [9]. following [5,10], our parser is based upon a probabilistic generative model. that is, for all sentences s and all parses 7r, the parser assigns a probability p(s , 7r) = p(r), the equality holding when we restrict consideration to 7r whose yield * this research was supported in part by nsf grant lis sbr 9720368. the author would like to thank mark johnson and all the rest of the brown laboratory for linguistic information processing. is s. then for any s the parser returns the parse ir that maximizes this probability. that is, the parser implements the function arg maxrp(7r s) = arg maxirp(7r, s) = arg maxrp(w). what fundamentally distinguishes probabilistic generative parsers is how they compute p(r), and it is to that topic we turn next.it is to this project that our future parsing work will be devoted. what fundamentally distinguishes probabilistic generative parsers is how they compute p(r), and it is to that topic we turn next. 
we present a new parser for parsing down to penn tree-bank style parse trees [16] that achieves 90.1% average precision/recall for sentences of length < 40, and 89.5% for sentences of length < 100, when trained and tested on the previously established [5,9,10,15,17] "standard" sections of the wall street journal tree-bank. indeed, we initiated this line of work in an attempt to create a parser that would be flexible enough to allow modifications for parsing down to more semantic levels of detail. we have presented a lexicalized markov grammar parsing model that achieves (using the now standard training/testing/development sections of the penn treebank) an average precision/recall of 91.1% on sentences of length < 40 and 89.5% on sentences of length < 100. this corresponds to an error reduction of 13% over the best previously published single parser results on this test set, those of collins [9]. in the previous sections we have concentrated on the relation of the parser to a maximumentropy approach, the aspect of the parser that is most novel. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A00-2019.txt b/DATASET_PACSUM/dataset/inputs/A00-2019.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168abfb7b9ddeba1b19d9739aa28cd22c84b766 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A00-2019.txt @@ -0,0 +1 @@ +a good indicator of whether a person knows the meaning of a word is the ability to use it appropriately in a sentence (miller and gildea, 1987). much information about usage can be obtained from quite a limited context: choueka and lusignan (1985) found that people can typically recognize the intended sense of a polysemous word by looking at a narrow window of one or two words around it. statistically-based computer programs have been able to do the same with a high level of accuracy (kilgarriff and palmer, 2000). 
the goal of our work is to automatically identify inappropriate usage of specific vocabulary words in essays by looking at the local contextual cues around a target word. we have developed a statistical system, alek (assessing lexical knowledge), that uses statistical analysis for this purpose. a major objective of this research is to avoid the laborious and costly process of collecting errors (or negative evidence) for each word that we wish to evaluate. instead, we train alek on a general corpus of english and on edited text containing example uses of the target word. the system identifies inappropriate usage based on differences between the word's local context cues in an essay and the models of context it has derived from the corpora of well-formed sentences. a requirement for alek has been that all steps in the process be automated, beyond choosing the words to be tested and assessing the results. once a target word is chosen, preprocessing, building a model of the word's appropriate usage, and identifying usage errors in essays is performed without manual intervention. alek has been developed using the test of english as a foreign language (toefl) administered by the educational testing service. toefl is taken by foreign students who are applying to us undergraduate and graduate-level programs.toefl is taken by foreign students who are applying to us undergraduate and graduate-level programs. a good indicator of whether a person knows the meaning of a word is the ability to use it appropriately in a sentence (miller and gildea, 1987). the unsupervised techniques that we have presented for inferring negative evidence are effective in recognizing grammatical errors in written text. however, its techniques could be incorporated into a grammar checker for native speakers. approaches to detecting errors by non-native writers typically produce grammars that look for specific expected error types (schneider and mccoy, 1998; park, palmer and washburn, 1997). 
the problem of error detection does not entail finding similarities to appropriate usage, rather it requires identifying one element among the contextual cues that simply does not fit. alek has been developed using the test of english as a foreign language (toefl) administered by the educational testing service. under this approach, essays written by esl students are collected and examined for errors. this system was tested on eight essays, but precision and recall figures are not reported. an incorrect usage can contain two or three salient contextual elements as well as a single anomalous element. comparison of these results to those of other systems is difficult because there is no generally accepted test set or performance baseline. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A00-2024.txt b/DATASET_PACSUM/dataset/inputs/A00-2024.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f102ba1e8e4a805840782c366b4651accda58e8 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A00-2024.txt @@ -0,0 +1 @@ +there is a big gap between the summaries produced by current automatic summarizers and the abstracts written by human professionals. certainly one factor contributing to this gap is that automatic systems can not always correctly identify the important topics of an article. another factor, however, which has received little attention, is that automatic summarizers have poor text generation techniques. most automatic summarizers rely on extracting key sentences or paragraphs from an article to produce a summary. since the extracted sentences are disconnected in the original article, when they are strung together, the resulting summary can be inconcise, incoherent, and sometimes even misleading. we present a cut and paste based text summarization technique, aimed at reducing the gap between automatically generated summaries and human-written abstracts. 
rather than focusing on how to identify key sentences, as do other researchers, we study how to generate the text of a summary once key sentences have been extracted. the main idea of cut and paste summarization is to reuse the text in an article to generate the summary. however, instead of simply extracting sentences as current summarizers do, the cut and paste system will "smooth" the extracted sentences by editing them. such edits mainly involve cutting phrases and pasting them together in novel ways. the key features of this work are:there is a big gap between the summaries produced by current automatic summarizers and the abstracts written by human professionals. the key features of this work are: any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the national science foundation. we thank ibm for licensing us the esg parser and the mitre corporation for licensing us the coreference resolution system. finally, we conclude and discuss future work. we will also extend the system to query-based summarization and investigate whether the system can be modified for multiple document summarization. this paper presents a novel architecture for text summarization using cut and paste techniques observed in human-written abstracts. ing operations. related work is discussed in section 6. we identified six operations that can be used alone or together to transform extracted sentences into sentences in human-written abstracts. (mani et al., 1999) addressed the problem of revising summaries to improve their quality. however, the combination operations and combination rules that we derived from corpus analysis are significantly different from those used in the above system, which mostly came from operations in traditional natural language generation. such edits mainly involve cutting phrases and pasting them together in novel ways. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A00-2026.txt b/DATASET_PACSUM/dataset/inputs/A00-2026.txt new file mode 100644 index 0000000000000000000000000000000000000000..688af7220904933ba37b9419270afb3390f892a5 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A00-2026.txt @@ -0,0 +1 @@ +this paper presents three trainable systems for surface natural language generation (nlg). surface nlg, for our purposes, consists of generating a grammatical natural language phrase that expresses the meaning of an input semantic representation. the systems take a "corpus-based" or "machinelearning" approach to surface nlg, and learn to generate phrases from semantic input by statistically analyzing examples of phrases and their corresponding semantic representations. the determination of the content in the semantic representation, or "deep" generation, is not discussed here. instead, the systems assume that the input semantic representation is fixed and only deal with how to express it in natural language. this paper discusses previous approaches to surface nlg, and introduces three trainable systems for surface nlg, called nlg1, nlg2, and nlg3. quantitative evaluation of experiments in the air travel domain will also be discussed.this paper presents three trainable systems for surface natural language generation (nlg). quantitative evaluation of experiments in the air travel domain will also be discussed. this paper presents the first systems (known to the author) that use a statistical learning approach to produce natural language text directly from a semantic representation. we conjecture that nlg2 and nlg3 should work in other domains which have a complexity similar to air travel, as well as available annotated data. the nlg2 and nlg3 systems automatically attempt to generalize from the knowledge inherent in the training corpus of templates, so that they can generate templates for novel attribute sets. 
in contrast, (langkilde and knight, 1998) uses corpus-derived statistical knowledge to rank plausible hypotheses from a grammarbased surface generation component. templates are the easiest way to implement surface nlg. this limitation can be overcome by using features on values, so that nlg2 and nlg3 might discover — to use a hypothetical example — that "flights leaving $city-fr" is preferred over "flights from $city-fr" when $city-fr is a particular value, such as "miami". our current approach has the limitation that it ignores the values of attributes, even though they might strongly influence the word order and word choice. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A00-2030.txt b/DATASET_PACSUM/dataset/inputs/A00-2030.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5f528079efc7a6c217f0c6f23751039f615956a --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A00-2030.txt @@ -0,0 +1 @@ +since 1995, a few statistical parsing algorithms (magerman, 1995; collins, 1996 and 1997; charniak, 1997; rathnaparki, 1997) demonstrated a breakthrough in parsing accuracy, as measured against the university of pennsylvania treebank as a gold standard. yet, relatively few have embedded one of these algorithms in a task. chiba, (1999) was able to use such a parsing algorithm to reduce perplexity with the long term goal of improved speech recognition. in this paper, we report adapting a lexicalized, probabilistic context-free parser with head rules (lpcfg-hr) to information extraction. the technique was benchmarked in the seventh message understanding conference (muc-7) in 1998. several technical challenges confronted us and were solved: treebank on wall street journal adequately train the algorithm for new york times newswire, which includes dozens of newspapers? manually creating sourcespecific training data for syntax was not required. 
instead, our parsing algorithm, trained on the upenn treebank, was run on the new york times source to create unsupervised syntactic training which was constrained to be consistent with semantic annotation.this simple semantic annotation was the only source of task knowledge used to configure the model. we have demonstrated, at least for one problem, that a lexicalized, probabilistic context-free parser with head rules (lpcfghr) can be used effectively for information extraction. instead, our parsing algorithm, trained on the upenn treebank, was run on the new york times source to create unsupervised syntactic training which was constrained to be consistent with semantic annotation. while performance did not quite match the best previously reported results for any of these three tasks, we were pleased to observe that the scores were at or near state-of-the-art levels for all cases. since 1995, a few statistical parsing algorithms (magerman, 1995; collins, 1996 and 1997; charniak, 1997; rathnaparki, 1997) demonstrated a breakthrough in parsing accuracy, as measured against the university of pennsylvania treebank as a gold standard. we evaluated the new approach to information extraction on two of the tasks of the seventh message understanding conference (muc-7) and reported in (marsh, 1998). our system for muc-7 consisted of the sentential model described in this paper, coupled with a simple probability model for cross-sentence merging. for the following example, the template relation in figure 2 was to be generated: "donald m. goldstein, a historian at the university of pittsburgh who helped write..." 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A00-2031.txt b/DATASET_PACSUM/dataset/inputs/A00-2031.txt new file mode 100644 index 0000000000000000000000000000000000000000..401941f1962d27df8b2fcaa6a8f2ef2ea4c9ec06 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A00-2031.txt @@ -0,0 +1 @@ +parsing sentences using statistical information gathered from a treebank was first examined a decade ago in (chitrad and grishman, 1990) and is by now a fairly well-studied problem ((charniak, 1997), (collins, 1997), (ratnaparkhi, 1997)). but to date, the end product of the parsing process has for the most part been a bracketing with simple constituent labels like np, vp, or sbar. the penn treebank contains a great deal of additional syntactic and semantic information from which to gather statistics; reproducing more of this information automatically is a goal which has so far been mostly ignored. this paper details a process by which some of this information—the function tags— may be recovered automatically. in the penn treebank, there are 20 tags (figure 1) that can be appended to constituent labels in order to indicate additional information about the syntactic or semantic role of the constituent. we have divided them into four categories (given in figure 2) based on those in the bracketing guidelines (bies et al., 1995). a constituent can be tagged with multiple tags, but never with two tags from the same category.1 in actuality, the case where a constituent has tags from all four categories never happens, but constituents with three tags do occur (rarely). at a high level, we can simply say that having the function tag information for a given text is useful just because any further information would help. but specifically, there are distinct advantages for each of the various categories. 
grammatical tags are useful for any application trying to follow the thread of the text—they find the 'who does what' of each clause, which can be useful to gain information about the situation or to learn more about the behaviour of the words in the sentence. the form/function tags help to find those constituents behaving in ways not conforming to their labelled type, as well as further clarifying the behaviour of adverbial phrases. information retrieval applications specialising in describing events, as with a number of the muc applications, could greatly benefit from some of these in determining the where-when-why of things. noting a topicalised constituent could also prove useful to these applications, and it might also help in discourse analysis, or pronoun resolution. finally, the 'miscellaneous' tags are convenient at various times; particularly the clr 'closely related' tag, which among other things marks phrasal verbs and prepositional ditransitives. to our knowledge, there has been no attempt so far to recover the function tags in parsing treebank text. in fact, we know of only one project that used them at all: (collins, 1997) defines certain constituents as complements based on a combination of label and function tag information. this boolean condition is then used to train an improved parser.this boolean condition is then used to train an improved parser. this work presents a method for assigning function tags to text that has been parsed to the simple label level. • there is no reason to think that this work could not be integrated directly into the parsing process, particularly if one's parser is already geared partially or entirely towards feature-based statistics; the function tag information could prove quite useful within the parse itself, to rank several parses to find the most plausible. 
parsing sentences using statistical information gathered from a treebank was first examined a decade ago in (chitrad and grishman, 1990) and is by now a fairly well-studied problem ((charniak, 1997), (collins, 1997), (ratnaparkhi, 1997)). but to date, the end product of the parsing process has for the most part been a bracketing with simple constituent labels like np, vp, or sbar. in fact, we know of only one project that used them at all: (collins, 1997) defines certain constituents as complements based on a combination of label and function tag information. there are, it seems, two reasonable baselines for this and future work. we have found it useful to define our statistical model in terms of features. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A00-2034.txt b/DATASET_PACSUM/dataset/inputs/A00-2034.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ed6b870ab87336b4c9f27d98163c76f56399ac5 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A00-2034.txt @@ -0,0 +1 @@ +diathesis alternations are alternate ways in which the arguments of a verb are expressed syntactically. the syntactic changes are sometimes accompanied by slight changes in the meaning of the verb. an example of the causative alternation is given in (1) below. in this alternation, the object of the transitive variant can also appear as the subject of the intransitive variant. in the conative alternation, the transitive form alternates with a prepositional phrase construction involving either at or on. an example of the conative alternation is given in (2). we refer to alternations where a particular semantic role appears in different grammatical roles in alternate realisations as "role switching alternations" (rsas). it is these alternations that our method applies to. 
recently, there has been interest in corpus-based methods to identify alternations (mccarthy and korhonen, 1998; lapata, 1999), and associated verb classifications (stevenson and merlo, 1999). these have either relied on a priori knowledge specified for the alternations in advance, or are not suitable for a wide range of alternations. the fully automatic method outlined here is applied to the causative and conative alternations, but is applicable to other rsas.however, a considerably larger corpus would be required to overcome the sparse data problem for other rsa alternations. we have discovered a significant relationship between the similarity of selectional preferences at the target slots, and participation in the causative and conative alternations. diathesis alternations are alternate ways in which the arguments of a verb are expressed syntactically. the fully automatic method outlined here is applied to the causative and conative alternations, but is applicable to other rsas. we propose a method to acquire knowledge of alternation participation directly from corpora, with frequency information available as a by-product. notably, only one negative decision was made because of the disparate frame frequencies, which reduces the cost of combining the argument head data. diathesis alternations have been proposed for a number of nlp tasks. earlier work by resnik (1993) demonstrated a link between selectional preference strength and participation in alternations where the direct object is omitted. the syntactic changes are sometimes accompanied by slight changes in the meaning of the verb. these have either relied on a priori knowledge specified for the alternations in advance, or are not suitable for a wide range of alternations. for the conative, a sample of 16 verbs was used and this time accuracy was only 56%. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A88-1019.txt b/DATASET_PACSUM/dataset/inputs/A88-1019.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0c5799f5d1167b6d56ca28177643438d841b628 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A88-1019.txt @@ -0,0 +1 @@ +it is well-known that part of speech depends on context. the word "table," for example, can be a verb in some contexts (e.g., "he will table the motion") and a noun in others (e.g., "the table is ready"). a program has been written which tags each word in an input sentence with the most likely part of speech. the program produces the following output for the two "table" sentences just mentioned: (pps = subject pronoun; md = modal; vb = verb (no inflection); at = article; nn = noun; bez = present 3rd sg form of "to be"; jj = adjective; notation is borrowed from [francis and kucera, pp. 6-8]) part of speech tagging is an important practical problem with potential applications in many areas including speech synthesis, speech recognition, spelling correction, proof-reading, query answering, machine translation and searching large text data bases (e.g., patents, newspapers). the author is particularly interested in speech synthesis applications, where it is clear that pronunciation sometimes depends on part of speech. consider the following three examples where pronunciation depends on part of speech. first, there are words like "wind" where the noun has a different vowel than the verb. that is, the noun "wind" has a short vowel as in "the wind is strong," whereas the verb "wind" has a long vowel as in "don't forget to wind your watch." secondly, the pronoun "that" is stressed as in "did you see that?" unlike the complementizer "that," as in "it is a shame that he's leaving." 
thirdly, note the difference between "oily fluid" and "transmission fluid"; as a general rule, an adjective-noun sequence such as "oily fluid" is typically stressed on the right whereas a noun-noun sequence such as "transmission fluid" is typically stressed on the left. these are but three of the many constructions which would sound more natural if the synthesizer had access to accurate part of speech information. perhaps the most important application of tagging programs is as a tool for future research. a number of large projects such as [cobuild] have recently been collecting large corpora (101000 million words) in order to better describe how language is actually used in practice: "for the first time, a dictionary has been compiled by the thorough examination of representative group of english texts, spoken and written, running to many millions of words. this means that in addition to all the tools of the conventional dictionary makers... the dictionary is based on hard, measureable evidence." [cobuild, p. xv] it is likely that there will be more and more research projects collecting larger and larger corpora. a reliable parts program might greatly enhance the value of these corpora to many of these researchers. the program uses a linear time dynamic programming algorithm to find an assignment of parts of speech to words that optimizes the product of (a) lexical probabilities (probability of observing part of speech i given word j), and (b) contextual probabilities (probability of observing part of speech i given k previous parts of speech). probability estimates were obtained by training on the tagged brown corpus [francis and kucera], a corpus of approximately 1,000,000 words with part of speech tags assigned laboriously by hand over many years. program performance is encouraging (95-99% "correct", depending on the definition of "correct"). a small 400 word sample is presented in the appendix, and is judged to be 99.5% correct. 
it is surprising that a local "bottom-up" approach can perform so well. most errors are attributable to defects in the lexicon; remarkably few errors are related to the inadequacies of the extremely over-simplified grammar (a trigram model). apparently, "long distance" dependences are not very important, at least most of the time. one might have thought that ngram models weren't adequate for the task since it is wellknown that they are inadequate for determining grammaticality: "we find that no finite-state markov process that produces symbols with transition from state to state can serve as an english grammar. furthermore, the particular subclass of such processes that produce norder statistical approximations to english do not come closer, with increasing n, to matching the output of an english grammar." [chomsky, p. 113] chomslcy's conclusion was based on the observation that constructions such as: have long distance dependencies that span across any fixed length window n. thus, ngram models are clearly inadequate for many natural language applications. however, for the tagging application, the ngram approximation may be acceptable since long distance dependencies do not seem to be very important. statistical ngram models were quite popular in the 1950s, and have been regaining popularity over the past few years. the ibm speech group is perhaps the strongest advocate of ngram methods, especially in other applications such as speech recognition. robert mercer (private communication, 1982) has experimented with the tagging application, using a restricted corpus (laser patents) and small vocabulary (1000 words). another group of researchers working in lancaster around the same time, leech, garside and atwell, also found ngram models highly effective; they report 96.7% success in automatically tagging the lob corpus, using a bigram model modified with heuristics to cope with more important trigrams. the present work developed independently from the lob project. 
many people who have not worked in computational linguistics have a strong intuition that lexical ambiguity is usually not much of a problem. it is commonly believed that most words have just one part of speech, and that the few exceptions such as "table" are easily disambiguated by context in most cases. in contrast, most experts in computational linguistics have found lexical ambiguity to be a major issue; it is said that practically any content word can be used as a noun, verb or adjective,i and that local context is not always adequate to disambiguate. introductory texts are full of ambiguous sentences such as where no amount of syntactic parsing will help. these examples are generally taken to indicate that the parser must allow for multiple possibilities and that grammar formalisms such as lr(k) are inadequate for natural language since these formalisms cannot cope with ambiguity. this argument was behind a large set of objections to marcus' "lr(k)-like" deterministic parser. although it is clear that an expert in computational linguistics can dream up arbitrarily hard sentences, it may be, as marcus suggested, that most texts are not very hard in practice. recall that marcus hypothesized most decisions can be resolved by the parser within a small window (i.e., three buffer cells), and there are only a few problematic cases where the parser becomes confused. he called these confusing cases "garden paths," by analogy with the famous example: • the horse raced past the barn fell. with just a few exceptions such as these "garden paths," marcus assumes, there is almost always a unique "best" interpretation which can be found with very limited resources. the proposed stochastic approach is largely compatible with this; the proposed approach 1. from an information theory point of view, one can quantify ambiguity in bits. 
in the case of the brown tagged corpus, the lexical entropy, the conditional entropy of the part of speech given the word is about 0.25 bits per part of speech. this is considerably smaller than the contextual entropy, the conditional entropy of the part of speech given the next two parts of speech. this entropy is estimated to be about 2 bits per part of speech. assumes that it is almost always sufficient to assign each word a unique "best" part of speech (and this can be accomplished with a very efficient linear time dynamic programming algorithm). after reading introductory discussions of "flying planes can be dangerous," one might have expected that lexical ambiguity was so pervasive that it would be hopeless to try to assign just one part of speech to each word and in just one linear time pass over the input words.the proposed method omitted only 5 of 243 noun phrase brackets in the appendix. it is well-known that part of speech depends on context. find all assignments of parts of speech to "a" and score. after reading introductory discussions of "flying planes can be dangerous," one might have expected that lexical ambiguity was so pervasive that it would be hopeless to try to assign just one part of speech to each word and in just one linear time pass over the input words. the word "table," for example, can be a verb in some contexts (e.g., "he will table the motion") and a noun in others (e.g., "the table is ready"). this entropy is estimated to be about 2 bits per part of speech. assumes that it is almost always sufficient to assign each word a unique "best" part of speech (and this can be accomplished with a very efficient linear time dynamic programming algorithm). this is considerably smaller than the contextual entropy, the conditional entropy of the part of speech given the next two parts of speech. there is some tendency to underestimate the number of brackets and run two noun phrases together as in [np the time fairchild]. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A92-1006.txt b/DATASET_PACSUM/dataset/inputs/A92-1006.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdb6b28bd8baebe69003fc7ef1fe53dbdeb80b06 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A92-1006.txt @@ -0,0 +1 @@ +this paper presents the joyce system as an example of a fully-implemented, application-oriented text generation system. joyce covers the whole range of tasks associated with text generation, from content selection to morphological processing. it was developped as part of the interface of the software design environment ulysses. the following design goals were set for it: while we were able to exploit existing research for many of the design issues, it turned out that we needed to develop our own approach to text planning (ra.mbow 1990). this paper will present the system and attempt to show how these design objectives led to particular design decisions. the structure of the paper is as follows. in section 2, we will present the underlying application and give examples of the output of the system. in section 3, we will discuss the overall structure of joyce. we then discuss the three main components in turn: the text planner in section 4, the sentence planner in section 5 and the realizer in section 6. we will discuss the text planner in some detail since it represents a new approach to the problem. section 7 traces the generation of a short text. in section 8, we address the problem of portability, and wind up by discussing some shortcomings of joyce in the conclusion.in section 8, we address the problem of portability, and wind up by discussing some shortcomings of joyce in the conclusion. this paper presents the joyce system as an example of a fully-implemented, application-oriented text generation system. we are aware of several shortcomings of joyce, which we will address in future versions of the system. 
ple in text planning, it appears to play an important role as a constraint on possible text structures. it passes it through the incrementor to the formater, which downgrades it when a classified corrected reading leaves through p34. it has met the design objectives of speed and quality, and our experience in porting the text generator to new tasks and to new applications indicates that joyce is a flexible system that can adapt to a variety of text generation tasks. initial results, including a prototype, are encouraging. porting is an important way to evaluate complete applied text generation systems, since there is no canonical set of tasks that such a system must be able to perform and on which it can be tested. the analyzer downgrades it to secret. furthermore, it helps determine the use of connectives between rhetorically related clauses. the joyce text generation system was developed as part of the software design environment ulysses (korelsky and ulysses staff 1988; rosenthal et al 1988) ulysses includes a graphical environment for the design of secure, distributed software systems. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A92-1018.txt b/DATASET_PACSUM/dataset/inputs/A92-1018.txt new file mode 100644 index 0000000000000000000000000000000000000000..517a426569fc72c57ab73f55fb45a20ceb0849ce --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A92-1018.txt @@ -0,0 +1 @@ +many words are ambiguous in their part of speech. for example, "tag" can be a noun or a verb. however, when a word appears in the context of other words, the ambiguity is often reduced: in "a tag is a part-of-speech label," the word "tag" can only be a noun. a part-of-speech tagger is a system that uses context to assign parts of speech to words. automatic text tagging is an important first step in discovering the linguistic structure of large text corpora. 
part-of-speech information facilitates higher-level analysis, such as recognizing noun phrases and other patterns in text. for a tagger to function as a practical component in a language processing system, we believe that a tagger must be: robust text corpora contain ungrammatical constructions, isolated phrases (such as titles), and nonlinguistic data (such as tables). corpora are also likely to contain words that are unknown to the tagger. it is desirable that a tagger deal gracefully with these situations. efficient if a tagger is to be used to analyze arbitrarily large corpora, it must be efficient—performing in time linear in the number of words tagged. any training required should also be fast, enabling rapid turnaround with new corpora and new text genres. accurate a tagger should attempt to assign the correct part-of-speech tag to every word encountered. tunable a tagger should be able to take advantage of linguistic insights. one should be able to correct systematic errors by supplying appropriate a priori "hints." it should be possible to give different hints for different corpora. reusable the effort required to retarget a tagger to new corpora, new tagsets, and new languages should be minimal.reusable the effort required to retarget a tagger to new corpora, new tagsets, and new languages should be minimal. many words are ambiguous in their part of speech. one should be able to correct systematic errors by supplying appropriate a priori "hints." it should be possible to give different hints for different corpora. the algorithm has an accuracy of approximately 80% in assigning grammatical functions. we have used the tagger in a number of applications. by using the fact that words are typically associated with only a few part-ofspeech categories, and carefully ordering the computation, the algorithms have linear complexity (section 3.3). for example, "tag" can be a noun or a verb. several different approaches have been used for building text taggers. 
probabilities corresponding to category sequences that never occurred in the training data are assigned small, non-zero values, ensuring that the model will accept any sequence of tokens, while still providing the most likely tagging. we describe three applications here: phrase recognition; word sense disambiguation; and grammatical function assignment. if a noun phrase is labeled, it is also annotated as to whether the governing verb is the closest verb group to the right or to the left. taggit disambiguated 77% of the corpus; the rest was done manually over a period of several years. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A92-1021.txt b/DATASET_PACSUM/dataset/inputs/A92-1021.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c0f5dd5d36812f04875649e1aeb4e5a21fc02eb --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A92-1021.txt @@ -0,0 +1 @@ +there has been a dramatic increase in the application of probabilistic models to natural language processing over the last few years. the appeal of stochastic techniques over traditional rule-based techniques comes from the ease with which the necessary statistics can be automatically acquired and the fact that very little handcrafted knowledge need be built into the system. in contrast, the rules in rule-based systems are usually difficult to construct and are typically not very robust. one area in which the statistical approach has done particularly well is automatic part of speech tagging, assigning each word in an input sentence its proper part of speech [church 88; cutting et al. 92; derose 88; deroualt and merialdo 86; garside et al. 87; jelinek 85; kupiec 89; meteer et al. 911. stochastic taggers have obtained a high degree of accuracy without performing any syntactic analysis on the input. these stochastic part of speech taggers make use of a markov model which captures lexical and contextual information. 
the parameters of the model can be estimated from tagged ([church 88; derose 88; deroualt and merialdo 86; garside et al. 87; meteer et al. 91]) or untagged ([cutting et al. 92; jelinek 85; kupiec 89]) text. once the parameters of the model are estimated, a sentence can then be automatically tagged by assigning it the tag sequence which is assigned the highest probability by the model. performance is often enhanced with the aid of various higher level pre- and postprocessing procedures or by manually tuning the model. a number of rule-based taggers have been built [klein and simmons 63; green and rubin 71; hindle 89]. [klein and simmons 63] and [green and rubin 71] both have error rates substantially higher than state of the art stochastic taggers. [hindle 89] disambiguates words within a deterministic parser. we wanted to determine whether a simple rule-based tagger without any knowledge of syntax can perform as well as a stochastic tagger, or if part of speech tagging really is a domain to which stochastic techniques are better suited. in this paper we describe a rule-based tagger which performs as well as taggers based upon probabilistic models. the rule-based tagger overcomes the limitations common in rule-based approaches to language processing: it is robust, and the rules are automatically acquired. in addition, the tagger has many advantages over stochastic taggers, including: a vast reduction in stored information required, the perspicuity of a small set of meaningful rules as opposed to the large tables of statistics needed for stochastic taggers, ease of finding and implementing improvements to the tagger, and better portability from one tag set or corpus genre to another.we have presented a simple part of speech tagger which performs as well as existing stochastic taggers, but has significant advantages over these taggers. 
there has been a dramatic increase in the application of probabilistic models to natural language processing over the last few years. the fact that the simple rule-based tagger can perform so well should offer encouragement for researchers to further explore rule-based tagging, searching for a better and more expressive set of patch templates and other variations on this simple but effective theme. in addition, the tagger has many advantages over stochastic taggers, including: a vast reduction in stored information required, the perspicuity of a small set of meaningful rules as opposed to the large tables of statistics needed for stochastic taggers, ease of finding and implementing improvements to the tagger, and better portability from one tag set or corpus genre to another. the rule-based tagger overcomes the limitations common in rule-based approaches to language processing: it is robust, and the rules are automatically acquired. perhaps the biggest contribution of this work is in demonstrating that the stochastic method is not the only viable approach for part of speech tagging. the tagger is extremely portable. the appeal of stochastic techniques over traditional rule-based techniques comes from the ease with which the necessary statistics can be automatically acquired and the fact that very little handcrafted knowledge need be built into the system. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A94-1006.txt b/DATASET_PACSUM/dataset/inputs/A94-1006.txt new file mode 100644 index 0000000000000000000000000000000000000000..1406168a5ea20ce425d76718f95ded102ef68047 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A94-1006.txt @@ -0,0 +1 @@ +the statistical corpus-based renaissance in computational linguistics has produced a number of interesting technologies, including part-of-speech tagging and bilingual word alignment. unfortunately, these technologies are still not as widely deployed in practical applications as they might be. 
part-ofspeech taggers are used in a few applications, such as speech synthesis (sproat et al., 1992) and question answering (kupiec, 1993b). word alignment is newer, found only in a few places (gale and church, 1991a; brown et al., 1993; dagan et al., 1993). it is used at ibm for estimating parameters of their statistical machine translation prototype (brown et al., 1993). we suggest that part of speech tagging and word alignment could have an important role in glossary construction for translation. glossaries are extremely important for translation. how would microsoft, or some other software vendor, want the term "character menu" to be translated in their manuals? technical terms are difficult for translators because they are generally not as familiar with the subject domain as either the author of the source text or the reader of the target text. in many cases, there may be a number of acceptable translations, but it is important for the sake of consistency to standardize on a single one. it would be unacceptable for a manual to use a variety of synonyms for a particular menu or button. customarily, translation houses make extensive job-specific glossaries to ensure consistency and correctness of technical terminology for large jobs. a glossary is a list of terms and their translations.' we will subdivide the task of constructing a glossary into two subtasks: (1) generating a list of terms, and (2) finding the translation equivalents. the first task will be referred to as the monolingual task and the second as the bilingual task. how should a glossary be constructed? translation schools teach their students to read as much background material as possible in both the source and target languages, an extremely time-consuming process, as the introduction to hann's (1992, p. 
8) text on technical translation indicates: contrary to popular opinion, the job of a technical translator has little in common with other linguistic professions, such as literature translation, foreign correspondence or interpreting. apart from an expert knowledge of both languages..., all that is required for the latter professions is a few general dictionaries, whereas a technical translator needs a whole library of specialized dictionaries, encyclopedias and 'the source and target fields are standard, though many other fields can also be found, e.g., usage notes, part of speech constraints, comments, etc. technical literature in both languages; he is more concerned with the exact meanings of terms than with stylistic considerations and his profession requires certain 'detective' skills as well as linguistic and literary ones. beginners in this profession have an especially hard time... this book attempts to meet this requirement. unfortunately, the academic prescriptions are often too expensive for commercial practice. translators need just-in-time glossaries. they cannot afford to do a lot of background reading and "detective" work when they are being paid by the word. they need something more practical. we propose a tool, termight, that automates some of the more tedious and laborious aspects of terminology research. the tool relies on part-of-speech tagging and word-alignment technologies to extract candidate terms and translations. it then sorts the extracted candidates and presents them to the user along with reference concordance lines, supporting efficient construction of glossaries. the tool is currently being used by the translators at at&t business translation services (formerly at&t language line services). termight may prove useful in contexts other than human-based translation. primarily, it can support customization of machine translation (mt) lexicons to a new domain. 
in fact, the arguments for constructing a job-specific glossary for human-based translation may hold equally well for an mt-based process, emphasizing the need for a productivity tool. the monolingual component of termight can be used to construct terminology lists in other applications, such as technical writing, book indexing, hypertext linking, natural language interfaces, text categorization and indexing in digital libraries and information retrieval (salton, 1988; cherry, 1990; harding, 1982; bourigault, 1992; damerau, 1993), while the bilingual component can be useful for information retrieval in multilingual text collections (landauer and littman, 1990).we have shown that terminology research provides a good application for robust natural language technology, in particular for part-of-speech tagging and word-alignment algorithms. the statistical corpus-based renaissance in computational linguistics has produced a number of interesting technologies, including part-of-speech tagging and bilingual word alignment. in particular, we have found the following to be very effective: as the need for efficient knowledge acquisition tools becomes widely recognized, we hope that this experience with termight will be found useful for other text-related systems as well. unfortunately, these technologies are still not as widely deployed in practical applications as they might be. in fact, the arguments for constructing a job-specific glossary for human-based translation may hold equally well for an mt-based process, emphasizing the need for a productivity tool. 
the monolingual component of termight can be used to construct terminology lists in other applications, such as technical writing, book indexing, hypertext linking, natural language interfaces, text categorization and indexing in digital libraries and information retrieval (salton, 1988; cherry, 1990; harding, 1982; bourigault, 1992; damerau, 1993), while the bilingual component can be useful for information retrieval in multilingual text collections (landauer and littman, 1990). primarily, it can support customization of machine translation (mt) lexicons to a new domain. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A94-1009.txt b/DATASET_PACSUM/dataset/inputs/A94-1009.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8f2b2edeb51f0d814745bf74b4783bb6a0b86bb --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A94-1009.txt @@ -0,0 +1 @@ +part-of-speech tagging is the process of assigning grammatical categories to individual words in a corpus. one widely used approach makes use of a statistical technique called a hidden markov model (hmm). the model is defined by two collections of parameters: the transition probabilities, which express the probability that a tag follows the preceding one (or two for a second order model); and the lexical probabilities, giving the probability that a word has a given tag without regard to words on either side of it. to tag a text, the tags with non-zero probability are hypothesised for each word, and the most probable sequence of tags given the sequence of words is determined from the probabilities. two algorithms are commonly used, known as the forward-backward (fb) and viterbi algorithms. fb assigns a probability to every tag on every word, while viterbi prunes tags which cannot be chosen because their probability is lower than the ones of competing hypotheses, with a corresponding gain in computational efficiency. for an introduction to the algorithms, see cutting et at. 
(1992), or the lucid description by sharman (1990). there are two principal sources for the parameters of the model. if a tagged corpus prepared by a human annotator is available, the transition and lexical probabilities can be estimated from the frequencies of pairs of tags and of tags associated with words. alternatively, a procedure called baum-welch (bw) re-estimation may be used, in which an untagged corpus is passed through the fb algorithm with some initial model, and the resulting probabilities used to determine new values for the lexical and transition probabilities. by iterating the algorithm with the same corpus, the parameters of the model can be made to converge on values which are locally optimal for the given text. the degree of convergence can be measured using a perplexity measure, the sum of plog2p for hypothesis probabilities p, which gives an estimate of the degree of disorder in the model. the algorithm is again described by cutting et al. and by sharman, and a mathematical justification for it can be found in huang et al. (1990). the first major use of hmms for part of speech tagging was in claws (garside et al., 1987) in the 1970s. with the availability of large corpora and fast computers, there has been a recent resurgence of interest, and a number of variations on and alternatives to the fb, viterbi and bw algorithms have been tried; see the work of, for example, church (church, 1988), brill (brill and marcus, 1992; brill, 1992), derose (derose, 1988) and kupiec (kupiec, 1992). one of the most effective taggers based on a pure hmm is that developed at xerox (cutting et al., 1992). an important aspect of this tagger is that it will give good accuracy with a minimal amount of manually tagged training data. 96% accuracy correct assignment of tags to word token, compared with a human annotator, is quoted, over a 500000 word corpus. the xerox tagger attempts to avoid the need for a hand-tagged training corpus as far as possible. 
instead, an approximate model is constructed by hand, which is then improved by bw re-estimation on an untagged training corpus. in the above example, 8 iterations were sufficient. the initial model set up so that some transitions and some tags in the lexicon are favoured, and hence having a higher initial probability. convergence of the model is improved by keeping the number of parameters in the model down. to assist in this, low frequency items in the lexicon are grouped together into equivalence classes, such that all words in a given equivalence class have the same tags and lexical probabilities, and whenever one of the words is looked up, then the data common to all of them is used. re-estimation on any of the words in a class therefore counts towards re-estimation for all of them'. the results of the xerox experiment appear very encouraging. preparing tagged corpora either by hand is labour-intensive and potentially error-prone, and although a semi-automatic approach can be used (marcus et al., 1993), it is a good thing to reduce the human involvement as much as possible. however, some careful examination of the experiment is needed. in the first place, cutting et al. do not compare the success rate in their work with that achieved from a hand-tagged training text with no re-estimation. secondly, it is unclear how much the initial biasing contributes to the success rate. if significant human intervention is needed to provide the biasing, then the advantages of automatic training become rather weaker, especially if such intervention is needed on each new text domain. the kind of biasing cutting et al. describe reflects linguistic insights combined with an understanding of the predictions a tagger could reasonably be expected to make and the ones it could not. the aim of this paper is to examine the role that training plays in the tagging process, by an experimental evaluation of how the accuracy of the tagger varies with the initial conditions. 
the results suggest that a completely unconstrained initial model does not produce good quality results, and that one 'the technique was originally developed by kupiec (kupiec, 1989). accurately trained from a hand-tagged corpus will generally do better than using an approach based on re-estimation, even when the training comes from a different source. a second experiment shows that there are different patterns of re-estimation, and that these patterns vary more or less regularly with a broad characterisation of the initial conditions. the outcome of the two experiments together points to heuristics for making effective use of training and reestimation, together with some directions for further research. work similar to that described here has been carried out by merialdo (1994), with broadly similar conclusions. we will discuss this work below. the principal contribution of this work is to separate the effect of the lexical and transition parameters of the model, and to show how the results vary with different degree of similarity between the training and test data.from the observations in the previous section, we propose the following guidelines for how to train a hmm for use in tagging: able, use bw re-estimation with standard convergence tests such as perplexity. the principal contribution of this work is to separate the effect of the lexical and transition parameters of the model, and to show how the results vary with different degree of similarity between the training and test data. part-of-speech tagging is the process of assigning grammatical categories to individual words in a corpus. one widely used approach makes use of a statistical technique called a hidden markov model (hmm). we will discuss this work below. in the end it may turn out there is simply no way of making the prediction without a source of information extrinsic to both model and corpus. 
work similar to that described here has been carried out by merialdo (1994), with broadly similar conclusions. the general pattern of the results presented does not vary greatly with the corpus and tagset used. during the first experiment, it became apparent that baum-welch re-estimation sometimes decreases the accuracy as the iteration progresses. to tag a text, the tags with non-zero probability are hypothesised for each word, and the most probable sequence of tags given the sequence of words is determined from the probabilities. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A94-1016.txt b/DATASET_PACSUM/dataset/inputs/A94-1016.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebe24f41cfb7a188ba891fe3d19fb7c42a1737e0 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A94-1016.txt @@ -0,0 +1 @@ +machine-readable dictionary (the collins spanish/english), the lexicons used by the kbmt modules, a large set of user-generated bilingual glossaries as well as a gazetteer and a list of proper and organization names. the outputs from these engines (target language words and phrases) are recorded in a chart whose positions correspond to words in the source language input. as a result of the operation of each of the mt engines, new edges are added to the chart, each labeled with the translation of a region of the input string and indexed by this region's beginning and end positions. we will refer to all of these edges as components (as in "components of the translation") for the remainder of this article. the kbmt and ebmt engines also carry a quality score for each output element. the kbmt scores are produced based on whether any questionable heuristics were used in the source analysis or target generation. the ebmt scores are produced using a technique based on human judgements, as described in (nirenburg et al., 1994a), submitted. figure 1 presents a general view of the operation of our multi-engine mt system. 
the chart manager selects the overall best cover from the collection of candidate partial translations by normalizing each component's quality score (positive, with larger being better), and then selecting the best combination of components with the help of the chart walk algorithm. figure 2 illustrates the result of this process on the example spanish sentence: al momenta de su yenta a iberia, viasa contaba con ocho aviones, que tenzan en promedio 13 anos de vuelo which can be translated into english as at the moment of its sale to iberia, viasa had eight airplanes, which had on average thirteen years of flight (time). this is a sentence from one of the 1993 arpa mt evaluation texts. for each component, the starting and ending positions in the chart, the corresponding source language words, and alternative translations are shown, as well as the engine and the engine-internal quality scores. inspection of these translations shows numerous problems; for example, at position 12, "aviones" is translated, among other things, as "aircrafts". it must be remembered that these were generated automatically from an on-line dictionary, without any lexical feature marking or other human intervention. it is well known that such automatic methods are at the moment less than perfect, to say the least. in our current system, this is not a major problem, since the results go through a mandatory editing step, as described below. the chart manager normalizes the internal scores to make them directly comparable. in the case of kbmt and ebmt, the pre-existing scores are modified, while lexical transfer results are scored based on the estimated reliability of individual databases, from 0.5 up to 15. currently the kbmt scores are reduced by a constant, except for known erroneous output, which has its score set to zero. the internal ebmt scores range from 0 being perfect to 10,000 being worthless; but the scores are nonlinear. 
so a region selected by a threshold is converted linearly into scores ranging from zero to a normalized maximum ebmt score. the normalization levels were empirically determined in the initial experiment by having several individuals judge the comparative average quality of the outputs in an actual translation run. in every case, the base score produced by the scoring functions is currently multiplied by the length of the candidate in words, on the assumption that longer items are better. we intend to test a variety of functions in order to find the right contribution of the length factor. figure 3 presents the chart walk algorithm used to produce a single, best, non-overlapping, contiguous combination (cover) of the available component translations, assuming correct component quality scores. the code is organized as a recursive divide-and-conquer procedure: to calculate the cover of a region of the input, it is repeatedly split into two parts, at each possible position. each time, the best possible cover for each part is recursively found, and the two scores are combined to give a score for the chart walk containing the two best subwalks. these different splits are then compared with each other and with components from the chart spanning the whole region (if any), and the overall best result is without dynamic programming, this would have a combinatorial time complexity. dynamic programming utilizes a large array to store partial results, so that the best cover of any given subsequence is only computed once; the second time that a recursive call would compute the same result, it is retrieved from the array instead. this reduces the time complexity to o(n3), and in practice it uses an insignificant part of total processing time. all possible combinations of components are compared: this is not a heuristic method, but an efficient exhaustive one. this is what assures that the chosen cover is optimal. 
this assumes, in addition to the scores actually being correct, that the scores are compositional, in the sense that the combined score for a set of components really represents their quality as a group. this might not be the case, for example, if gaps or overlaps are allowed in some cases (perhaps where they contain the same words in the same positions). we calculate the combined score for a sequence of d 2 components as the weighted average of their individual scores. weighting by length is necessary so that g 5 the same components, when combined in a different order, produce the same combined scores. otherwise the algorithm can produce inconsistent results. e 8.8 the chart walk algorithm can also be thought of as filling in the two-dimensional dynamic-programming arrayl . figure 4 shows an intermediate point in the filling of the array. in this figure, each element (i,j) is initially the best score of any single chart compod 2 nent covering the input region from word i to word j. dashes indicate that no one component covers exnote that this array is a different data structure from the chart. actly that region. (in rows 1 through 7, the array has not yet been operated on, so it still shows its initial state.) after processing (see rows 9 through 22), each element is the score for the best set of components covering the input from word i to word j (the best cover for this substring)2. (only a truncated score is shown for each element in the figure, for readability. there is also a list of best components associated with each element.) the array is upper triangular since the starting position of a component i must be less than or equal to its ending position j. for any position, the score is calculated based on a combination of scores in the row to its left and in the column below it, versus the previous contents of the array cell for its position. so the array must be filled from the bottom-up, and left to right. 
intuitively, this is because larger regions must be built up from smaller regions within them. for example, to calculate element (8,10), we compute the length-weighted averages of the scores of the best walks over the pair of elements (8,8) and (9,10) versus the pair (8,9) and (10,10), and compare them with the scores of any single chart components going from 8 to 10 (there were none), and take the maximum. referring to figure 2 again, this corresponds to a choice between combining the translations of (8,8) viasa and (9,10) contaba con versus combining the (not shown) translations of (8,9) viasa contaba and (10,10) con. (this (8,9) element was itself previously built up from single word components.) thus, we compare (2*1+ 10*2)/3 = 7.33 with (3.5*2+2*1)/3 = 3.0 and select the first, 7.33. the first wins because contaba con has a high score as an idiom from the glossary. figure 5 shows the final array. when the element in the top-right corner is produced (5.78), the algorithm is finished, and the associated set of components is the final chart walk result shown in figure 2. it may seem that the scores should increase towards the top-right corner. this has not generally been the case. while the system produces a number of high-scoring short components, many lowscoring components have to be included to span the entire input. since the score is a weighted average, these low-scoring components pull the combined score down. a clear example can be seen at position (18,18), which has a score of 15. the scores above and to its right each average this 15 with a 5, for total values of 10.0 (all the lengths happen to be 1), and the score continues to decrease with distance from this point as one moves towards the final score, which does include the component for (18,18) in the cover. 
the chart-oriented integration of mt engines does not easily support deviations from the linear order of the source text elements, as when discontinuous constituents translate contiguous strings or in the case of cross-component substring order differences. we use a language pair-dependent set of postprocessing rules to alleviate this (for example, by switching the order of adjacent single-word adjective and noun components).ultimately, a multi-engine system depends on the quality of each particular engine. a less ambitious version of this idea would be to run the low-scoring engines only where there are gaps in the normally high-scoring engines. we use a language pair-dependent set of postprocessing rules to alleviate this (for example, by switching the order of adjacent single-word adjective and noun components). the outputs from these engines (target language words and phrases) are recorded in a chart whose positions correspond to words in the source language input. the chart-oriented integration of mt engines does not easily support deviations from the linear order of the source text elements, as when discontinuous constituents translate contiguous strings or in the case of cross-component substring order differences. as a result of the operation of each of the mt engines, new edges are added to the chart, each labeled with the translation of a region of the input string and indexed by this region's beginning and end positions. we will refer to all of these edges as components (as in "components of the translation") for the remainder of this article. machine-readable dictionary (the collins spanish/english), the lexicons used by the kbmt modules, a large set of user-generated bilingual glossaries as well as a gazetteer and a list of proper and organization names. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A97-1004.txt b/DATASET_PACSUM/dataset/inputs/A97-1004.txt new file mode 100644 index 0000000000000000000000000000000000000000..0dc352e7bc46c9b1a06a7ea26409c55697ab7bcf --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A97-1004.txt @@ -0,0 +1 @@ +the task of identifying sentence boundaries in text has not received as much attention as it deserves. many freely available natural language processing tools require their input to be divided into sentences, but make no mention of how to accomplish this (e.g. (brill, 1994; collins, 1996)). others perform the division implicitly without discussing performance (e.g. (cutting et al., 1992)). on first glance, it may appear that using a short list, of sentence-final punctuation marks, such as ., ?, and !, is sufficient. however, these punctuation marks are not used exclusively to mark sentence breaks. for example, embedded quotations may contain any of the sentence-ending punctuation marks and . is used as a decimal point, in email addresses, to indicate ellipsis and in abbreviations. both ! and ? are somewhat less ambiguous *the authors would like to acknowledge the support of arpa grant n66001-94-c-6043, aro grant daah0494-g-0426 and nsf grant sbr89-20230. but appear in proper names and may be used multiple times for emphasis to mark a single sentence boundary. lexically-based rules could be written and exception lists used to disambiguate the difficult cases described above. however, the lists will never be exhaustive, and multiple rules may interact badly since punctuation marks exhibit absorption properties. sites which logically should be marked with multiple punctuation marks will often only have one ((nunberg, 1990) as summarized in (white, 1995)). for example, a sentence-ending abbreviation will most likely not be followed by an additional period if the abbreviation already contains one (e.g. note that d.0 is followed by only a single . 
in the president lives in washington, d.c.). as a result, we believe that manually writing rules is not a good approach. instead, we present a solution based on a maximum entropy model which requires a few hints about what information to use and a corpus annotated with sentence boundaries. the model trains easily and performs comparably to systems that require vastly more information. training on 39441 sentences takes 18 minutes on a sun ultra sparc and disambiguating the boundaries in a single wall street journal article requires only 1.4 seconds. we would also like to thank the anonymous reviewers for their helpful insights. training on 39441 sentences takes 18 minutes on a sun ultra sparc and disambiguating the boundaries in a single wall street journal article requires only 1.4 seconds. the task of identifying sentence boundaries in text has not received as much attention as it deserves. we would like to thank david palmer for giving us the test data he and marti hearst used for their sentence detection experiments. the model trains easily and performs comparably to systems that require vastly more information. to our knowledge, there have been few papers about identifying sentence boundaries. liberman and church suggest in (liberman and church, 1992) that a system could be quickly built to divide newswire text into sentences with a nearly negligible error rate, but do not actually build such a system. we have described an approach to identifying sentence boundaries which performs comparably to other state-of-the-art systems that require vastly more resources. instead, we present a solution based on a maximum entropy model which requires a few hints about what information to use and a corpus annotated with sentence boundaries. we present two systems for identifying sentence boundaries. many freely available natural language processing tools require their input to be divided into sentences, but make no mention of how to accomplish this (e.g. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A97-1011.txt b/DATASET_PACSUM/dataset/inputs/A97-1011.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a03cbdad751a9620148461368c62fcf12a1210a --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A97-1011.txt @@ -0,0 +1 @@ +we are concerned with surface-syntactic parsing of running text. our main goal is to describe syntactic analyses of sentences using dependency links that show the head-modifier relations between words. in addition, these links have labels that refer to the syntactic function of the modifying word. a simplified example is in figure 1, where the link between i and see denotes that i is the modifier of see and its syntactic function is that of subject. similarly, a modifies bird, and it is a determiner. first, in this paper, we explain some central concepts of the constraint grammar framework from which many of the ideas are derived. then, we give some linguistic background to the notations we are using, with a brief comparison to other current dependency formalisms and systems. new formalism is described briefly, and it is utilised in a small toy grammar to illustrate how the formalism works. finally, the real parsing system, with a grammar of some 2 500 rules, is evaluated. the parser corresponds to over three man-years of work, which does not include the lexical analyser and the morphological disambiguator, both parts of the existing english constraint grammar parser (karlsson et al., 1995). the parsers can be tested via www'.voutilainen and juha heikkild created the original engcg lexicon. we are using atro voutilainen's (1995) improved part-of-speech disambiguation grammar which runs in the cg-2 parser. the parsers can be tested via www'. we are concerned with surface-syntactic parsing of running text. in this paper, we have presented some main features of our new framework for dependency syntax. 
however, the comparison to other current systems suggests that our dependency parser is very promising both theoretically and practically. our work is partly based on the work done with the constraint grammar framework that was originally proposed by fred karlsson (1990). for instance, the results are not strictly comparable because the syntactic description is somewhat different. the evaluation was done using small excerpts of data, not used in the development of the system. our main goal is to describe syntactic analyses of sentences using dependency links that show the head-modifier relations between words. the distinction between the complements and the adjuncts is vague in the implementation; neither the complements nor the adjuncts are obligatory. means that a nominal head (nom-head is a set that contains part-of-speech tags that may represent a nominal head) may not appear anywhere to the left (not *-1). this "anywhere" to the left or right may be restricted by barriers, which restrict the area of the test. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A97-1014.txt b/DATASET_PACSUM/dataset/inputs/A97-1014.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd1d57843ee04f1e7765c95053f703696bec15e4 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A97-1014.txt @@ -0,0 +1 @@ +the work reported in this paper aims at providing syntactically annotated corpora (treebanks') for stochastic grammar induction. in particular, we focus on several methodological issues concerning the annotation of non-configurational languages. in section 2, we examine the appropriateness of existing annotation schemes. on the basis of these considerations, we formulate several additional requirements. a formalism complying with these requirements is described in section 3. section 4 deals with the treatment of selected phenomena. for a description of the annotation tool see section 5.for a description of the annotation tool see section 5. 
its extension is subject to further investigations. as the annotation scheme described in this paper focusses on annotating argument structure rather than constituent trees, it differs from existing treebanks in several aspects. the work reported in this paper aims at providing syntactically annotated corpora (treebanks') for stochastic grammar induction. these differences can be illustrated by a comparison with the penn treebank annotation scheme. a uniform representation of local and non-local dependencies makes the structure more transparent'. partial automation included in the current version significantly reduces the manna.1 effort. the development of linguistically interpreted corpora presents a laborious and time-consuming task. owing to the partial automation, the average annotation efficiency improves by 25% (from around 4 minutes to 3 minutes per sentence). combining raw language data with linguistic information offers a promising basis for the development of new efficient and robust nlp methods. such a word order independent representation has the advantage of all structural information being encoded in a single data structure. in order to make the annotation process more efficient, extra effort has been put. into the development of an annotation tool. realworld texts annotated with different strata of linguistic information can be used for grammar induction. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A97-1029.txt b/DATASET_PACSUM/dataset/inputs/A97-1029.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac4fe8b372b5f2c235822ed7634e5b40c5f3151c --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A97-1029.txt @@ -0,0 +1 @@ +in the past decade, the speech recognition community has had huge successes in applying hidden markov models, or hmm's to their problems. 
more recently, the natural language processing community has effectively employed these models for part-ofspeech tagging, as in the seminal (church, 1988) and other, more recent efforts (weischedel et al., 1993). we would now propose that hmm's have successfully been applied to the problem of name-finding. we have built a named-entity (ne) recognition system using a slightly-modified version of an hmm; we call our system "nymble". to our knowledge, nymble out-performs the best published results of any other learning name-finder. furthermore, it performs at or above the 90% accuracy level, often considered "near-human performance". the system arose from the ne task as specified in the last message understanding conference (muc), where organization names, person names, location names, times, dates, percentages and money amounts were to be delimited in text using sgml-markup. we will describe the various models employed, the methods for training these models and the method for "decoding" on test data (the term "decoding" borrowed from the speech recognition community, since one goal of traversing an hmm is to recover the hidden state sequence). to date, we have successfully trained and used the model on both english and spanish, the latter for met, the multi-lingual entity task.given the incredibly difficult nature of many nlp tasks, this example of a learned, stochastic approach to name-finding lends credence to the argument that the nlp community ought to push these approaches, to find the limit of phenomena that may be captured by probabilistic, finite-state methods. in the past decade, the speech recognition community has had huge successes in applying hidden markov models, or hmm's to their problems. we have shown that using a fairly simple probabilistic model, finding names and other numerical entities as specified by the muc tasks can be performed with "near-human performance", often likened to an f of 90 or above. 
to date, we have successfully trained and used the model on both english and spanish, the latter for met, the multi-lingual entity task. we would like to incorporate the following into the current model: while our initial results have been quite favorable, there is still much that can be done potentially to improve performance and completely close the gap between learned and rule-based name-finding systems. the basic premise of the approach is to consider the raw text encountered when decoding as though it had passed through a noisy channel, where it had been originally marked with named entities.' \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A97-1030.txt b/DATASET_PACSUM/dataset/inputs/A97-1030.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd92ae12f46e7720595bbc7f481795d0537d3124 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A97-1030.txt @@ -0,0 +1 @@ +text processing applications, such as machine translation systems, information retrieval systems or natural-language understanding systems, need to identify multi-word expressions that refer to proper names of people, organizations, places, laws and other entities. when encountering mrs. candy hill in input text, for example, a machine translation system should not attempt to look up the translation of candy and hill, but should translate mrs. to the appropriate personal title in the target language and preserve the rest of the name intact. similarly, an information retrieval system should not attempt to expand candy to all of its morphological variants or suggest synonyms (wacholder et al. 1994). the need to identify proper names has two aspects: the recognition of known names and the discovery of new names. since obtaining and maintaining a name database requires significant effort, many applications need to operate in the absence of such a resource. without a database, names need to be discovered in the text and linked to entities they refer to. 
even where name databases exist, text needs to be scanned for new names that are formed when entities, such as countries or commercial companies, are created, or for unknown names which become important when the entities they refer to become topical. this situation is the norm for dynamic applications such as news providing services or internet information indexing. the next section describes the different types of proper name ambiguities we have observed. section 3 discusses the role of context and world knowledge in their disambiguation; section 4 describes the process of name discovery as implemented in nominator, a module for proper name recognition developed at the ibm t.j. watson research center. sections 5-7 elaborate on nominator's disambiguation heuristics.sections 5-7 elaborate on nominator's disambiguation heuristics. ambiguity remains one of the main challenges in the processing of natural language text. because of these difficulties, we believe that for the forseeable future, practical applications to discover new names in text will continue to require the sort of human effort invested in nominator. text processing applications, such as machine translation systems, information retrieval systems or natural-language understanding systems, need to identify multi-word expressions that refer to proper names of people, organizations, places, laws and other entities. an evaluation of an earlier version of nominator, was performed on 88 wall street journal documents (nist 1993) that had been set aside for testing. in the rest of the paper we describe the resources and heuristics we have designed and implemented in nominator and the extent to which they resolve these ambiguities. name identification requires resolution of a subset of the types of structural and semantic ambiguities encountered in the analysis of nouns and noun phrases (nps) in natural language processing. 
all of these ambiguities must be dealt with if proper names are to be identified correctly. it assigns weak types such as ?human or fails to assign a type if the available information is not sufficient. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A97-1039.txt b/DATASET_PACSUM/dataset/inputs/A97-1039.txt new file mode 100644 index 0000000000000000000000000000000000000000..215dd4e5140337f82fb95a9799cc818a33043bfc --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A97-1039.txt @@ -0,0 +1 @@ +systems that generate natural language output as part of their interaction with a user have become a major area of research and development. typically, natural language generation is divided into several phases, namely text planning (determining output content and structure), sentence planning (determining abstract target language resources to express content, such as lexical items and syntactic constructions), and realization (producing the final text string) (reiter, 1994). while text and sentence planning may sometimes be combined, a realizer is almost always included as a distinct module. it is in the realizer that knowledge about the target language resides (syntax, morphology, idiosyncratic properties of lexical items). realization is fairly well understood both from a linguistic and from a computational point of view, and therefore most projects that use text generation do not include the realizer in the scope of their research. instead, such projects use an off-the-shelf realizer, among which penman (bateman, 1996) and surge/fuf (elhadad and robin, 1996) are probably the most popular. in this technical note and demo we present a new off-theshelf realizer, realpro. realpro is derived from previous systems (iordanskaja et al., 1988; iordanslcaja et al., 1992; rambow and korelsky, 1992), but represents a new design and a completely new implementation. 
realpro has the following characteristics, which we believe are unique in this combination: we reserve a more detailed comparison with penman and fuf, as well as with alethgen/gl (coch, 1996) (which is perhaps the system most similar to realpro, since they are based on the same linguistic theory and are both implemented with speed in mind), for a more extensive paper. this technical note presents realpro, concentrating on its structure, its coverage, its interfaces, and its performance.systems that generate natural language output as part of their interaction with a user have become a major area of research and development. this technical note presents realpro, concentrating on its structure, its coverage, its interfaces, and its performance. we are grateful to r. kittredge, t. korelsky, d. mccullough, a. nasr, e. reiter, and m. white as well as to three anonymous reviewers for helpful comments about earlier drafts of this technical note and/or about realpro. the input to realpro is a syntactic dependency structure. the development of realpro was partially supported by usaf rome laboratory under contracts f3060293-c-0015, f30602-94-c-0124, and f30602-92-c-0163, and by darpa under contracts f30602-95-2-0005 and f30602-96-c-0220. this means that realpro gives the developer control over the output, while taking care of the linguistic details. realpro is licensed free of charge to qualified academic institutions, and is licensed for a fee to commercial sites. the system is fully operational, runs on pc as well as on unix work stations, and is currently used in an application we have developed (lavoie et al., 1997) as well as in several on-going projects (weather report generation, machine translation, project report generation). the architecture of realpro is based on meaningtext theory, which posits a sequence of correspondences between different levels of representation. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/A97-1052.txt b/DATASET_PACSUM/dataset/inputs/A97-1052.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1bcbb3b4c6668b5a1fc871538aaa669fe0572da --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/A97-1052.txt @@ -0,0 +1 @@ +predicate subcategorization is a key component of a lexical entry, because most, if not all, recent syntactic theories 'project' syntactic structure from the lexicon. therefore, a wide-coverage parser utilizing such a lexicalist grammar must have access to an accurate and comprehensive dictionary encoding (at a minimum) the number and category of a predicate's arguments and ideally also information about control with predicative arguments, semantic selection preferences on arguments, and so forth, to allow the recovery of the correct predicate-argument structure. if the parser uses statistical techniques to rank analyses, it is also critical that the dictionary encode the relative frequency of distinct subcategorization classes for each predicate. several substantial machine-readable subcategorization dictionaries exist for english, either built largely automatically from machine-readable versions of conventional learners' dictionaries, or manually by (computational) linguists (e.g. the alvey nl tools (anlt) dictionary, boguraev et al. (1987); the comlex syntax dictionary, grishman et al. (1994)). unfortunately, neither approach can yield a genuinely accurate or comprehensive computational lexicon, because both rest ultimately on the manual efforts of lexicographers / linguists and are, therefore, prone to errors of omission and commission which are hard or impossible to detect automatically (e.g. boguraev & briscoe, 1989; see also section 3.1 below for an example). 
furthermore, manual encoding is labour intensive and, therefore, it is costly to extend it to neologisms, information not currently encoded (such as relative frequency of different subcategorizations), or other (sub)languages. these problems are compounded by the fact that predicate subcategorization is closely associated to lexical sense and the senses of a word change between corpora, sublanguages and/or subject domains (jensen, 1991). in a recent experiment with a wide-coverage parsing system utilizing a lexicalist grammatical framework, briscoe & carroll (1993) observed that half of parse failures on unseen test data were caused by inaccurate subcategorization information in the anlt dictionary. the close connection between sense and subcategorization and between subject domain and sense makes it likely that a fully accurate 'static' subcategorization dictionary of a language is unattainable in any case. moreover, although schabes (1992) and others have proposed `lexicalized' probabilistic grammars to improve the accuracy of parse ranking, no wide-coverage parser has yet been constructed incorporating probabilities of different subcategorizations for individual predicates, because of the problems of accurately estimating them. these problems suggest that automatic construction or updating of subcategorization dictionaries from textual corpora is a more promising avenue to pursue. preliminary experiments acquiring a few verbal subcategorization classes have been reported by brent (1991, 1993), manning (1993), and ushioda et at. (1993). in these experiments the maximum number of distinct subcategorization classes recognized is sixteen, and only ushioda et at. attempt to derive relative subcategorization frequency for individual predicates. we describe a new system capable of distinguishing 160 verbal subcategorization classes—a superset of those found in the anlt and comlex syntax dictionaries. 
the classes also incorporate information about control of predicative arguments and alternations such as particle movement and extraposition. we report an initial experiment which demonstrates that this system is capable of acquiring the subcategorization classes of verbs and the relative frequencies of these classes with comparable accuracy to the less ambitious extant systems. we achieve this performance by exploiting a more sophisticated robust statistical parser which yields complete though 'shallow' parses, a more comprehensive subcategorization class classifier, and a priori estimates of the probability of membership of these classes. we also describe a small-scale experiment which demonstrates that subcategorization class frequency information for individual verbs can be used to improve parsing accuracy.predicate subcategorization is a key component of a lexical entry, because most, if not all, recent syntactic theories 'project' syntactic structure from the lexicon. the experiment and comparison reported above suggests that our more comprehensive subcategorization class extractor is able both to assign classes to individual verbal predicates and also to rank them according to relative frequency with comparable accuracy to extant systems. boguraev & briscoe, 1987). we achieve this performance by exploiting a more sophisticated robust statistical parser which yields complete though 'shallow' parses, a more comprehensive subcategorization class classifier, and a priori estimates of the probability of membership of these classes. we also describe a small-scale experiment which demonstrates that subcategorization class frequency information for individual verbs can be used to improve parsing accuracy. 
therefore, a wide-coverage parser utilizing such a lexicalist grammar must have access to an accurate and comprehensive dictionary encoding (at a minimum) the number and category of a predicate's arguments and ideally also information about control with predicative arguments, semantic selection preferences on arguments, and so forth, to allow the recovery of the correct predicate-argument structure. brent's (1993) approach to acquiring subcategorization is based on a philosophy of only exploiting unambiguous and determinate information in unanalysed corpora. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C00-1007.txt b/DATASET_PACSUM/dataset/inputs/C00-1007.txt new file mode 100644 index 0000000000000000000000000000000000000000..af2341fad5cb33c33c0f6bb1730f5ca597911967 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C00-1007.txt @@ -0,0 +1 @@ +moreover, in ma w cases it; is very important not to deviate from certain linguis- tic standards in generation, in which case hand- crafted grammars give excellent control. how- ever, in other applications tbr nlg the variety of the output is much bigger, and the demands on the quality of the output somewhat less strin- gent. a typical example is nlg in the con- text of (interlingua- or transthr-based) machine translation. another reason for reb~xing the quality of the output may be that not enough time is available to develop a flfll grammar tbr a new target language in nlg. in all these cases, stochastic ("empiricist") methods pro- vide an alternative to hand-crafted ("rational- ist") approaches to nlg. to our knowledge, the first to use stochastic techniques in nlg were langkilde and knight (1998a) and (1998b). in this paper, we present fergus (flexible em- piricist/rationalist generation using syntax). fertgus follows langkilde and knights seminal work in using an n-gram language model, but; we augment it with a tree-based stochastic model and a traditional tree-based syntactic grammar. 
more recent work on aspects of stochastic gen- eration include (langkilde and knight, 2000), (malouf, 1999) and (ratnaparkhi, 2000). betbre we describe in more detail how we use stochastic models in nlg, we recall the basic tasks in nlg (rainbow and korelsky, 1992; re- iter, 1994). during text p lanning, content and structure of the target text; are determined to achieve the overall communicative goal. dur- ing sentence planning, linguistic means - in particular, lexical and syntactic means are de- termined to convey smaller pieces of meaning. l)uring real izat ion, the specification chosen in sentence planning is transtbrmed into a surface string, by line~rizing and intlecting words in the sentence (and typically, adding function words). as in the work by langkilde and knight, our work ignores the text planning stage, but it; does address the sentence, planning and the realiza- tion stages. the structure of the paper is as tbllows.explo i t ing a probabi l ist ic hierarchical mode l for generat ion srinivas bangalore and owen rambow at&t labs research 180 park avenue f lorham park, nj 07932 {sr in?, rambow}@research, a r t . \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C00-1044.txt b/DATASET_PACSUM/dataset/inputs/C00-1044.txt new file mode 100644 index 0000000000000000000000000000000000000000..d61720659a31c88b44aa157fd3be4a995329b338 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C00-1044.txt @@ -0,0 +1 @@ +such features include sense, register, do- main spccilicity, pragmatic restrictions on usage, scnlan- lic markcdncss, and orientation, as well as automatically ictcnlifiecl links between words (e.g., semantic rclalcd- hess, syllollynly, antonylny, and tneronymy). aulomal- ically learning features of this type from hugc corpora allows the construction or augmentation of lexicons, and the assignment of scmanlic htbcls lo words and phrases in running text. 
this information in turn can bc used to help dcterlninc addilional features at the it?teal, clause, sentence, or document level. tiffs paper explores lira benelits that some lexical fea- tures of adjectives offer lor the prediction of a contexlual sentence-level feature, suojectivity. subjectivity in nat- ural language re[crs to aspects of language used to ex- press opinions and ewfluations. the computatiomtl task addressed here is to distinguish sentences used to present opinions and other tbrms of subjectivity (suojective sen- tences, e.g., "at several different layers, its a fascinating title") from sentences used to objectively present factual information (objective sentences, e.g., "bell industries inc. increased its quarterly to 10 cents from 7 cents a share"). much research in discourse processing has focused on task-oriented and insmmtional dialogs. the task ad- dressed here comes to the fore in other genres, especially news reporting and lnternet lorums, in which opinions of various agents are expressed and where subjectivity judgements couht help in recognizing inllammatory rues- sages ("llanles) and mining online sources for product reviews. ()thor (asks for whicll subjectivity recognition is potentially very useful include infornmtion extraction and information retrieval. assigning sub.icctivity labels to documents or portions of documents is an example of non-topical characteri?ation f information. current in- formation extraction and rolricval lechnology focuses al- most exclusively on lhe subject matter of the documcnls. yet, additiomtl components of a document inllucncc its relevance to imrlicuhu ? users or tasks, including, for ex- alnple, the evidential slatus el: lhc material presented, and attitudes adopted in fawn" or against a lmrticular person, event, or posilion (e.g., articles on a presidenlial cam- paign wrillen to promote a specific candidate). 
in sum- marization, subjectivity judgmcnls could be included in documcllt proiilcs to augment aulomatically produced docunacnt summaries, and to hel l) the user make rele- vance judgments when using a search engine. ()thor work on sub.iectivity (wicbc et al., 1999; bruce and wicbc, 2000) has established a positive and statisti- cally signilicant correlation with the presence of adiec- lives.effects of adjective orientation and gradability on sentence subjectivity vas i le ios hatz ivass i log lou depar tment o1 computer sc ience co lumbia un ivers i l y new york, ny 10027 vh@cs , co lumbia , edu janyce m. wiebe depar tment o f computer sc ience new mex ico state un ivers i ty las cruces , nm 88003 w iebe@cs , nmsu. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C00-1072.txt b/DATASET_PACSUM/dataset/inputs/C00-1072.txt new file mode 100644 index 0000000000000000000000000000000000000000..78cc47ff7b0463d67101437fb4651831a64ad6f3 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C00-1072.txt @@ -0,0 +1 @@ +toi)ic signatures can lie used to identify the t)resence of a (:omph~x conce.pt a concept hat consists of several related coinl)onents in fixed relationships. ]~.c.stauvant-uisit, for examph~, invoh,es at h,ast the concel)ts lltcgfit, t.(tt, pay, and possibly waiter, all(l dragon boat pcstivai (in tat- wan) involves the ct)llc(!l)t,s cal(tlztlt,s (a talisman to ward off evil), rnoza (something with the t)ower of preventing pestilen(:e and strengthening health), pic- tures of ch, un9 kuei (a nemesis of evil spirits), eggs standing on end, etc. only when the concepts co- occur is one licensed to infer the comph:x concept; cat or moza alone, for example, are not sufficient. at this time, we do not c.onsider the imerrelationships among tile concepts. 
since many texts may describe all the compo- nents of a comi)lex concept without ever exi)lic- itly mentioning the mlderlying complex concel/t--a tol)ic--itself, systems that have to identify topic(s), for summarization or information retrieval, require a method of infcuring comt)hx concellts flom their component words in the text. 2 re la ted work in late 1970s, ])e.long (dejong, 1982) developed a system called i"tiump (fast reading understand- ing and memory program) to skim newspaper sto- ries and extract the main details. frump uses a data structure called sketchy script to organize its world knowhdge. each sketchy script is what frumi ) knows al)out what can occur in l)articu- lar situations such as denmnstrations, earthquakes, labor strike.s, an(t so on. frump selects a t)artic- ular sketchy script based on clues to styled events in news articles. in other words, frump selects an eml)t3 ~ t(uni)late 1whose slots will be tilled on the fly as t"f[ump reads a news artme. a summary is gen- erated })ased on what has been (:al)tured or filled in the teml)iate. the recent success of infornmtion extractk)n re- search has encoreaged the fi{um1 ) api)roach. the summons (summarizing online news artmes) system (mckeown and radev, 1999) takes tem- l)late outputs of information extra(:tion systems de- velofmd for muc conference and generating smn- maries of multit)le news artmes. frump and sum- mons both rely on t/rior knowledge of their do- mains, th)wever, to acquire such t)rior knowledge is lal)or-intensive and time-consuming. 
i~)r exam-- l)le, the unive.rsity of massa(:husetts circus sys- l.enl use(l ill the muc-3 (saic, 1998) terrorism do- main required about 1500 i)erson-llours to define ex- traction lmtterns 2 (rilotf, 1996).the automated acquisit ion of topic signatures for text summarizat ion chin -yew l in and eduard hovy in fo rmat ion s(:i(umes i l l s t i tu te un ivers i ty of southern ca l i fo rn ia mar ina del rey, ca 90292, usa { cyl,hovy }c~isi.edu abst rac t in order to produce, a good summary, one has to identify the most relevant portions of a given text. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C00-2136.txt b/DATASET_PACSUM/dataset/inputs/C00-2136.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ee003ae771174893e04ce8fee9812880e0b49f9 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C00-2136.txt @@ -0,0 +1 @@ +we evaluate exdisco by com- paring the pertbrmance of discovered patterns against that of manually constructed systems on actual extraction tasks. 0 introduct ion intbrmation extraction is the selective xtrac- tion of specified types of intbrmation from nat- ural language text. the intbrmation to be extracted may consist of particular semantic classes of objects (entities), relationships among these entities, and events in which these entities participate. the extraction system places this intbrmation into a data base tbr retrieval and subsequent processing. in this paper we shall be concerned primar- ily with the extraction of intbrmation about events. in the terminology which has evolved tiom the message understanding conferences (muc, 1995; muc, 1993), we shall use the term subject domain to refer to a broad class of texts, such as business news, and tile term scenario to refer to tile specification of tile particular events to be extracted. 
for example, the "manage- ment succession" scenario for muc-6, which we shall refer to throughout this paper, involves in- formation about corporate executives tarting and leaving positions. the fundamental problem we face in port- ing an extraction system to a new scenario is to identify the many ways in which intbrmation about a type of event may be expressed in the text;. typically, there will be a few common tbrms of expression which will quickly come to nfind when a system is being developed. how- ever, the beauty of natural language (and the challenge tbr computational linguists) is that there are many variants which an imaginative writer cast use, and which the system needs to capture. finding these variants may involve studying very large amounts of text; in the sub- ject domain. this has been a major impediment to the portability and performance of event ex- traction systems. we present; in this paper a new approach to finding these variants automatically flom a large corpus, without the need to read or amlo- tate the corpus. this approach as been evalu- ated on actual event extraction scenarios. in the next section we outline the strncture of our extraction system, and describe the discov- ery task in the context of this system.automatic acquisition of domain knowledge for information extraction roman yangarber, ralph grishman past tapanainen courant inst i tute of conexor oy mathemat ica l sciences helsinki, f in land new york university {roman [ grishman}@cs, nyu. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C00-2137.txt b/DATASET_PACSUM/dataset/inputs/C00-2137.txt new file mode 100644 index 0000000000000000000000000000000000000000..dab961e295dad1b5acc07397fdc2d08e722ce657 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C00-2137.txt @@ -0,0 +1 @@ +5/]lell ,]le lcsllll;s are better with the new tcch- ni(lue , a question arises as t() wh(,l;h(;r these l:(`-- sult; (litleren(:es are due t() the new technique a(:t;ually 1)eing l)cl;t(x or just; due 1;o (:han(:e. un- tortmmtely, one usually callll()t) directly answer the qnesl;ion "what is the 1)robatfility that 1;11(; now l;(x:hni(luc, is t)el;lx~r givell l;he results on the t(,sl, dal;a sol;": i)(new technique is better [ test set results) ]~ul; with statistics, one cml answer the follow- ing proxy question: if the new technique was a(> tually no ditterent han the old t(,(hnique ((;he * this paper reports on work l)erfonncd at the mitr1,; corporation under the sul)porl: of the mitilj,; ,qponsored research l)rogrmn. warren grcit[, l ,ynette il irschlnm b christilm l)orall, john llen(lerson, kelmeth church, ted l)unning, wessel kraaij, milch marcus and an anony- mous reviewer l)rovided hell)rid suggestions. copyright @2000 the mitre corl)oration. all rights r(~s(nvcd. null hyl)othesis), wh~tt is 1:11(; 1)robat)ility that the results on the test set would l)e at least this skewed in the new techniques favor (box eta] . thai; is, what is p(test se, t results at least this skew(a in the new techni(lues favor i new technique is no (liffercnt than the old) if the i)robtfl)ility is small enough (5% off;on is used as the threshold), then one will rqiect the mill hyi)othems and say that the differences in 1;he results are :sta.tisl;ically siglfilicant" ai; that thrt,shold level. 
this 1)al)(n" examines some of th(`- 1)ossil)le me?hods for trying to detect statistically signif- leant diflelenc(`-s in three commonly used met- li(:s: telall, 1)re(ision and balanced f-score. many of these met;ire(is arc foun(t to be i)rol)lem- a.ti(" ill a, so, t; of exl)erinw, nts that are performed. thes(~ methods have a, tendency to ullderesti- mat(`- th(, signili(:ance, of the results, which tends t() 1hake one, 1)elieve thai; some new techni(tuc is no 1)el;l;er l;lmn the (:urrent technique even when il; is. this mtderest imate comes flom these lnc|h- ells assuming l;hat; the te(:hlfi(tues being con> lmrcd produce indepen(lc, nt results when in our exl)eriments , the techniques 1)eing coml)ared tend to 1)reduce l)ositively corr(`-lated results. to handle this problem, we, point out some st~ttistical tests, like the lnatche(t-pair t, sign and wilcoxon tests (harnett, 1982, see. 8.7 and 15.5), which do not make this assulnption. one call its(, l;llcse tes ts oll i;hc recall nlel;r ic, but l;he precision an(l 1)alanced f-score metric have too coml)lex a tbrm for these tests. for such com- 1)lex lne|;ri(;s~ we llse a colnplll;e-in|;clisiv(~ ran- domization test (cohen, 1995, sec. 5.3), which also ~tvoids this indet)en(lence assmnption.more accurate tes ts ibr the s ta t i s t i ca l s ign i f i cance of resu l t d i f ferences * alexander yeh mitre corp. 202 burli l lgl;on rd. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C00-2163.txt b/DATASET_PACSUM/dataset/inputs/C00-2163.txt new file mode 100644 index 0000000000000000000000000000000000000000..6659bb05529607e15d412f59f316075ca872c8ed --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C00-2163.txt @@ -0,0 +1 @@ +here .fi = f denotes tile (15ench) source and e{ = e denotes the (english) target string. 
most smt models (brown et al., 1993; vogel et al., 1996) try to model word-to-word corresl)ondences between source and target words using an alignment nmpl)ing from source l)osition j to target position i = aj. we can rewrite tim t)robal)ility pr(fille~) t) 3, in- troducing the hidden alignments ai 1 := al ...aj...a.l (aj c {0 , . , /} ) : pr(f~lel) = ~pr(f i ,a~le{) .1 ? j -1 i~ = e h pr(fj ajlf i -"al e l ) q, j=l to allow fbr french words wlfich do not directly cor- respond to any english word an artificial empty word c0 is added to the target sentence at position i=0. the different alignment models we present pro- vide different decoint)ositions of pr(f~,a~le(). an alignnlent 5~ for which holds a~ = argmax pr(fi , al[ei) at for a specific model is called v i terb i al ignment of" this model. in this paper we will describe extensions to tile hidden-markov alignment model froln (vogel et al., 1.996) and compare tlmse to models 1 - 4 of (brown et al., 1993). we t)roi)ose to measure the quality of an alignment nlodel using the quality of tlle viterbi alignment compared to a manually-produced align- ment. this has the advantage that once having pro- duced a reference alignlnent, the evaluation itself can be performed automatically. in addition, it results in a very precise and relia.ble valuation criterion which is well suited to assess various design decisions in modeling and training of statistical alignment mod- els. it, is well known that manually pertbrming a word aligmnent is a colnplicated and ambiguous task (melamed, 1998). therefore, to produce tlle refer- ence alignment we use a relined annotation scheme which reduces the complications and mnbiguities oc- curring in the immual construction of a word align- ment. as we use tile alignment models for machine translation purposes, we also evahlate the resulting translation quality of different nlodels. 
2 al ignment w i th hmm in the hidden-markov alignment model we assume a first-order dependence for tim aligmnents aj and that the translation probability depends olfly on aj and not oil (tj_l: - ~- el) =p(ajl.a compar i son of a l ignment mode ls for s ta t i s t i ca l mach ine trans la t ion franz josef och and hermann ney lehrstuhl fiir informatik vi, comlmter science department rwth aachen - university of technology d-52056 aachen, germany {och, ney}~inf ormat ik. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C02-1011.txt b/DATASET_PACSUM/dataset/inputs/C02-1011.txt new file mode 100644 index 0000000000000000000000000000000000000000..99e6c15c8a4800a21db08a8d75dfc01a61d5b758 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C02-1011.txt @@ -0,0 +1 @@ +we address here the problem of base np translation, in which for a given base noun phrase in a source language (e.g., ?information age? in english), we are to find out its possible translation(s) in a target language (e.g., ? in chinese). we define a base np as a simple and non-recursive noun phrase. in many cases, base nps represent holistic and non-divisible concepts, and thus accurate translation of them from one language to another is extremely important in applications like machine translation, cross language information retrieval, and foreign language writing assistance. in this paper, we propose a new method for base np translation, which contains two steps: (1) translation candidate collection, and (2) translation selection. in translation candidate collection, for a given base np in the source language, we look for its translation candidates in the target language. to do so, we use a word-to-word translation dictionary and corpus data in the target language on the web. in translation selection, we determine the possible translation(s) from among the candidates. 
we use non-parallel corpus data in the two languages on the web and employ one of the two methods which we have developed. in the first method, we view the problem as that of classification and employ an ensemble of na?ve bayesian classifiers constructed with the em algorithm. we will use ?em-nbc-ensemble? to denote this method, hereafter. in the second method, we view the problem as that of calculating similarities between context vectors and use tf-idf vectors also constructed with the em algorithm. we will use ?em-tf-idf? to denote this method. experimental results indicate that our method is very effective, and the coverage and top 3 accuracy of translation at the final stage are 91.4% and 79.8%, respectively. the results are significantly better than those of the baseline methods relying on existing technologies. the higher performance of our method can be attributed to the enormity of the web data used and the employment of the em algorithm.the higher performance of our method can be attributed to the enormity of the web data used and the employment of the em algorithm. we address here the problem of base np translation, in which for a given base noun phrase in a source language (e.g., ?information age? we also acknowledge shenjie li for help with program coding. this paper has proposed a new and effective method for base np translation by using web data and the em algorithm. the results are significantly better than those of the baseline methods relying on existing technologies. in english), we are to find out its possible translation(s) in a target language (e.g., ? 2.1 translation with non-parallel. we conducted experiments on translation of the base nps from english to chinese. experimental results indicate that our method is very effective, and the coverage and top 3 accuracy of translation at the final stage are 91.4% and 79.8%, respectively. in chinese). 
we extracted base nps (noun-noun pairs) from the encarta 1 english corpus using the tool developed by xun et al(2000). for nagata et al?s method, we found that it was almost impossible to find partial-parallel corpora in the non-web data. they observed that there are many partial parallel corpora between english and japanese on the web, and most typically english translations of japanese terms (words or phrases) are parenthesized and inserted immediately after the japanese terms in documents written in japanese. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C02-1054.txt b/DATASET_PACSUM/dataset/inputs/C02-1054.txt new file mode 100644 index 0000000000000000000000000000000000000000..c792410e8f85468111f2312ce6e853d61b3b4b78 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C02-1054.txt @@ -0,0 +1 @@ +named entity (ne) recognition is a task in whichproper nouns and numerical information in a docu ment are detected and classified into categories suchas person, organization, and date. it is a key technol ogy of information extraction and open-domain question answering (voorhees and harman, 2000). we are building a trainable open-domain question answering system called saiqa-ii. in this paper, we show that an ne recognizer based on support vector machines (svms) gives better scores thanconventional systems. svms have given high per formance in various classification tasks (joachims, 1998; kudo and matsumoto, 2001). however, it turned out that off-the-shelf svm classifiers are too inefficient for ne recognition. the recognizer runs at a rate of only 85 bytes/sec on an athlon 1.3 ghz linux pc, while rule-based systems (e.g., isozaki, (2001)) can process several kilobytes in a second. the major reason is the inefficiency of svm classifiers. there are otherreports on the slowness of svm classifiers. another svm-based ne recognizer (yamada and mat sumoto, 2001) is 0.8 sentences/sec on a pentium iii 933 mhz pc. an svm-based part-of-speech (pos). 
tagger (nakagawa et al, 2001) is 20 tokens/sec on an alpha 21164a 500 mhz processor. it is difficult to use such slow systems in practical applications. in this paper, we present a method that makes the ne system substantially faster. this method can also be applied to other tasks in natural languageprocessing such as chunking and pos tagging. another problem with svms is its incomprehensibil ity. it is not clear which features are important or how they work. the above method is also useful for finding useless features. we also mention a method to reduce training time. 1.1 support vector machines. suppose we have a set of training data for a two class problem: , where ffflfi is a feature vector of the ffi -th sample in the training data and !$#%# is the label forthe sample. the goal is to find a decision func tion that accurately predicts for unseen . a non-linear svm classifier gives a decision function ( ) * sign ,+-) for an input vector where +-) .* / 0 21)3 546879: !6; here, () *=!$# means is a member of a cer tain class and () $* # means is not a mem ber. 7 s are called support vectors and are repre sentatives of training examples. is the numberof support vectors. therefore, computational com plexity of +?) is proportional to . support vectorsand other constants are determined by solving a cer tain quadratic programming problem. 4687@ is akernel that implicitly maps vectors into a higher di mensional space. typical kernels use dot products: 4687@ a*cbed7@ . a polynomial kernel of degree fis given by bg? *hi#j!kg l . we can use vari mm m m n m m m m m m m m m n m o o o o o n o o o o o o o o o o o o m : positive example, o : negative example n m , n o : support vectors figure 1: support vector machine ous kernels, and the design of an appropriate kernel for a particular application is an important research issue.figure 1 shows a linearly separable case. the de cision hyperplane defined by +-) p*rq separatespositive and negative examples by the largest mar gin. 
the solid line indicates the decision hyperplaneand two parallel dotted lines indicate the margin be tween positive and negative examples. since such aseparating hyperplane may not exist, a positive pa rameter s is introduced to allow misclassifications. see vapnik (1995). 1.2 svm-based ne recognition. as far as we know, the first svm-based ne system was proposed by yamada et al (2001) for japanese.his system is an extension of kudo?s chunking sys tem (kudo and matsumoto, 2001) that gave the best performance at conll-2000 shared tasks. in theirsystem, every word in a sentence is classified sequentially from the beginning or the end of a sen tence. however, since yamada has not compared it with other methods under the same conditions, it is not clear whether his ne system is better or not. here, we show that our svm-based ne system ismore accurate than conventional systems. our sys tem uses the viterbi search (allen, 1995) instead of sequential determination.for training, we use ?crl data?, which was prepared for irex (information retrieval and extrac tion exercise1, sekine and eriguchi (2000)). it has about 19,000 nes in 1,174 articles. we also use additional data by isozaki (2001). both datasets are based on mainichi newspaper?s 1994 and 1995 cd-roms. we use irex?s formal test data calledgeneral that has 1,510 named entities in 71 ar ticles from mainichi newspaper of 1999. systems are compared in terms of general?s f-measure 1http://cs.nyu.edu/cs/projects/proteus/irexwhich is the harmonic mean of ?recall? and ?preci sion? and is defined as follows. recall = m/(the number of correct nes), precision = m/(the number of nes extracted by a system), where m is the number of nes correctly extracted and classified by the system.we developed an svm-based ne system by following our ne system based on maximum entropy (me) modeling (isozaki, 2001). we sim ply replaced the me model with svm classifiers.the above datasets are processed by a morphological analyzer chasen 2.2.12. 
it tokenizes a sen tence into words and adds pos tags. chasen uses about 90 pos tags such as common-noun and location-name. since most unknown words are proper nouns, chasen?s parameters for unknownwords are modified for better results. then, a char acter type tag is added to each word. it uses 17character types such as all-kanji and small integer. see isozaki (2001) for details. now, japanese ne recognition is solved by theclassification of words (sekine et al, 1998; borth wick, 1999; uchimoto et al, 2000). for instance, the words in ?president george herbert bush saidclinton is . . . are classified as follows: ?president? = other, ?george? = person-begin, ?her bert? = person-middle, ?bush? = person-end, ?said? = other, ?clinton? = person-single, ?is? = other. in this way, the first word of a person?s name is labeled as person-begin. the last word is labeled as person-end. other words in the nameare person-middle. if a person?s name is expressed by a single word, it is labeled as person single. if a word does not belong to any namedentities, it is labeled as other. since irex de fines eight ne classes, words are classified into 33 ( *utwvex!k# ) categories.each sample is represented by 15 features be cause each word has three features (part-of-speech tag, character type, and the word itself), and two preceding words and two succeeding words are also used for context dependence. although infrequent features are usually removed to prevent overfitting, we use all features because svms are robust. each sample is represented by a long binary vector, i.e., a sequence of 0 (false) and 1 (true). for instance, ?bush? in the above example is represented by a 2http://chasen.aist-nara.ac.jp/ vector p*yg[z\#^]_ g[z `a] described below. only 15 elements are 1. bdcfe8ghji // current word is not ?alice? bdc klghme // current word is ?bush? bdc nghji // current word is not ?charlie? 
: bdcfe^opikpqpghme // current pos is a proper noun bdcfe^opinipghji // current pos is not a verb : bdc nqre^sre ghji // previous word is not ?henry? bdc nqre^skghme // previous word is ?herbert? :here, we have to consider the following problems. first, svms can solve only a two-class problem. therefore, we have to reduce the above multi class problem to a group of two-class problems. second, we have to consider consistency among word classes in a sentence. for instance, a word classified as person-begin should be followed by person-middle or person-end. it impliesthat the system has to determine the best combina tions of word classes from numerous possibilities.here, we solve these problems by combining exist ing methods. there are a few approaches to extend svms to cover t -class problems. here, we employ the ?oneclass versus all others? approach. that is, each clas sifier (%u ) is trained to distinguish members of a class v from non-members. in this method, two or more classifiers may give !$# to an unseen vector or no classifier may give !$# . one common way to avoid such situations is to compare + u ) values and to choose the class index v of the largest + u ) . the consistency problem is solved by the viterbi search. since svms do not output probabilities, we use the svm+sigmoid method (platt, 2000). that is, we use a sigmoid function wxg? j*y#zi#{! |l}~ {g to map + u ) to a probability-like value. the output of the viterbi search is adjusted by a postprocessor for wrong word boundaries. the adjustment rules are also statistically determined (isozaki, 2001). 1.3 comparison of ne recognizers. we use a fixed value ?* #q9q . f-measures are not very sensitive to  unless  is too small. whenwe used 1,038,986 training vectors, general?s f measure was 89.64% for ?*?q?# and 90.03% for 6*?#q9q . we employ the quadratic kernel ( f *y? ) because it gives the best results. polynomial kernels of degree 1, 2, and 3 resulted in 83.03%, 88.31%, f-measure (%) ? ? rg+dt ? ? 
me ? ? svm 0 20 40 60 80 100 120 crl data ???e? ?^??:??? 76 78 80 82 84 86 88 90 number of nes in training data ( ?? ) figure 2: f-measures of ne systems and 87.04% respectively when we used 569,994 training vectors. figure 2 compares ne recognizers in terms ofgeneral?s f-measures. ?svm? in the figure in dicates f-measures of our system trained by kudo?s tinysvm-0.073 with s?*?q?# . it attained 85.04% when we used only crl data. ?me? indicates our me system and ?rg+dt? indicates a rule-basedmachine learning system (isozaki, 2001). according to this graph, ?svm? is better than the other sys tems.however, svm classifiers are too slow. fa mous svm-light 3.50 (joachims, 1999) took 1.2 days to classify 569,994 vectors derived from 2 mb documents. that is, it runs at only 19 bytes/sec. tinysvm?s classifier seems best optimized among publicly available svm toolkits, but it still works at only 92 bytes/sec.our svm-based ne recognizer attained f = 90.03%. we also thank shigeru katagiri and ken-ichiro ishii for their support. named entity (ne) recognition is a task in whichproper nouns and numerical information in a docu ment are detected and classified into categories suchas person, organization, and date. tinysvm?s classifier seems best optimized among publicly available svm toolkits, but it still works at only 92 bytes/sec. that is, it runs at only 19 bytes/sec. it is a key technol ogy of information extraction and open-domain question answering (voorhees and harman, 2000). fa mous svm-light 3.50 (joachims, 1999) took 1.2 days to classify 569,994 vectors derived from 2 mb documents. is better than the other sys tems.however, svm classifiers are too slow. in this paper, we show that an ne recognizer based on support vector machines (svms) gives better scores thanconventional systems. we are building a trainable open-domain question answering system called saiqa-ii. according to this graph, ?svm? 
svms have given high per formance in various classification tasks (joachims, 1998; kudo and matsumoto, 2001). indicates a rule-basedmachine learning system (isozaki, 2001). ?me? indicates our me system and ?rg+dt? however, it turned out that off-the-shelf svm classifiers are too inefficient for ne recognition. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C02-1114.txt b/DATASET_PACSUM/dataset/inputs/C02-1114.txt new file mode 100644 index 0000000000000000000000000000000000000000..d83065432f4e16493420fce8599699b053bb6c90 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C02-1114.txt @@ -0,0 +1 @@ +semantic knowledge for particular domains isincreasingly important in nlp. many applications such as word-sense disambiguation, in formation extraction and speech recognitionall require lexicons. the coverage of handbuilt lexical resources such as wordnet (fellbaum, 1998) has increased dramatically in re cent years, but leaves several problems andchallenges. coverage is poor in many criti cal, rapidly changing domains such as current affairs, medicine and technology, where much time is still spent by human experts employed to recognise and classify new terms. mostlanguages remain poorly covered in compari son with english. hand-built lexical resourceswhich cannot be automatically updated can of ten be simply misleading. for example, using wordnet to recognise that the word apple refers to a fruit or a tree is a grave error in the many situations where this word refers to a computer manufacturer, a sense which wordnet does notcover. for nlp to reach a wider class of appli cations in practice, the ability to assemble andupdate appropriate semantic knowledge auto matically will be vital. this paper describes a method for arranging semantic information into a graph (bolloba?s, 1998), where the nodes are words and the edges(also called links) represent relationships be tween words. the paper is arranged as follows. 
section 2 reviews previous work on semanticsimilarity and lexical acquisition. section 3 de scribes how the graph model was built from the pos-tagged british national corpus. section 4 describes a new incremental algorithm used to build categories of words step by step from thegraph model. section 5 demonstrates this algo rithm in action and evaluates the results againstwordnet classes, obtaining state-of-the-art re sults. section 6 describes how the graph modelcan be used to recognise when words are polysemous and to obtain groups of words represen tative of the different senses.semantic knowledge for particular domains isincreasingly important in nlp. section 6 describes how the graph modelcan be used to recognise when words are polysemous and to obtain groups of words represen tative of the different senses. so far we have presented a graph model built upon noun co-occurrence which performs much better than previously reported methods at the task of automatic lexical acquisition. 2 1http://infomap.stanford.edu/graphs 2http://muchmore.dfki.defigure 1: automatically generated graph show ing the word apple and semantically related nouns this isan important task, because assembling and tuning lexicons for specific nlp systems is increas ingly necessary. many applications such as word-sense disambiguation, in formation extraction and speech recognitionall require lexicons. section 5 demonstrates this algo rithm in action and evaluates the results againstwordnet classes, obtaining state-of-the-art re sults. this research was supported in part by theresearch collaboration between the ntt communication science laboratories, nippon tele graph and telephone corporation and csli,stanford university, and by ec/nsf grant ist 1999-11438 for the muchmore project. acknowledgements the authors would like to thank the anonymous reviewers whose comments were a great help inmaking this paper more focussed: any short comings remain entirely our own responsibility. 
we now take a step furtherand present a simple method for not only as sembling words with similar meanings, but for empirically recognising when a word has several meanings. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C02-1144.txt b/DATASET_PACSUM/dataset/inputs/C02-1144.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fb317e4c072022ebdce346f4f72532cbe5f0b42 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C02-1144.txt @@ -0,0 +1 @@ +broad-coverage lexical resources such as wordnet are extremely useful in applications such as word sense disambiguation (leacock, chodorow, miller 1998) and question answering (pasca and harabagiu 2001). however, they often include many rare senses while missing domain-specific senses. for example, in wordnet, the words dog, computer and company all have a sense that is a hyponym of person. such rare senses make it difficult for a coreference resolution system to use wordnet to enforce the constraint that personal pronouns (e.g. he or she) must refer to a person. on the other hand, wordnet misses the user-interface object sense of the word dialog (as often used in software manuals). one way to deal with these problems is to use a clustering algorithm to automatically induce semantic classes (lin and pantel 2001). many clustering algorithms represent a cluster by the centroid of all of its members (e.g., k means) (mcqueen 1967) or by a representative element (e.g., k-medoids) (kaufmann and rousseeuw 1987). when averaging over all elements in a cluster, the centroid of a cluster may be unduly influenced by elements that only marginally belong to the cluster or by elements that also belong to other clusters. for example, when clustering words, we can use the contexts of the words as features and group together the words that tend to appear in similar contexts. for instance, u.s. 
state names can be clustered this way because they tend to appear in the following contexts: (list a) ___ appellate court campaign in ___ ___ capital governor of ___ ___ driver's license illegal in ___ ___ outlaws sth. primary in ___ ___'s sales tax senator for ___ if we create a centroid of all the state names, the centroid will also contain features such as: (list b) ___'s airport archbishop of ___ ___'s business district fly to ___ ___'s mayor mayor of ___ ___'s subway outskirts of ___ because some of the state names (like new york and washington) are also names of cities. using a single representative from a cluster may be problematic too because each individual element has its own idiosyncrasies that may not be shared by other members of the cluster. in this paper, we propose a clustering algo rithm, cbc (clustering by committee), in which the centroid of a cluster is constructed by averaging the feature vectors of a subset of the cluster members. the subset is viewed as a committee that determines which other elements belong to the cluster. by carefully choosing committee members, the features of the centroid tend to be the more typical features of the target class. for example, our system chose the following committee members to compute the centroid of the state cluster: illinois, michigan, minnesota, iowa, wisconsin, indiana, nebraska and vermont. as a result, the centroid contains only features like those in list a. evaluating clustering results is a very difficult task. we introduce a new evaluation methodol ogy that is based on the editing distance between output clusters and classes extracted from wordnet (the answer key).we presented a clustering algorithm, cbc, for automatically discovering concepts from text. we introduce a new evaluation methodol ogy that is based on the editing distance between output clusters and classes extracted from wordnet (the answer key). 
this research was partly supported by natural sciences and engineering research council of canada grant ogp121338 and scholarship pgsb207797. as a result, the centroid contains only features like those in list a. evaluating clustering results is a very difficult task. however, they often include many rare senses while missing domain-specific senses. we generated clusters from a news corpus using cbc and compared them with classes extracted from wordnet (miller 1990). the parameters k and t are usually considered to be small numbers. broad-coverage lexical resources such as wordnet are extremely useful in applications such as word sense disambiguation (leacock, chodorow, miller 1998) and question answering (pasca and harabagiu 2001). five of the 943 clusters discovered by cbc from s13403 along with their features with top-15 highest mutual information and the wordnet classes that have the largest intersection with each cluster. test data. clustering algorithms are generally categorized as hierarchical and partitional. to extract classes from wordnet, we first estimate the probability of a random word belonging to a subhierarchy (a synset and its hyponyms). \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C02-1145.txt b/DATASET_PACSUM/dataset/inputs/C02-1145.txt new file mode 100644 index 0000000000000000000000000000000000000000..d22c2f7ae7d0dba962616c54d4a5251790d1e3cd --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C02-1145.txt @@ -0,0 +1 @@ +the penn chinese treebank (ctb) is an ongoing project, with its objective being to create a segmented chinese corpus annotated with pos tags and syntactic brackets. the first installment of the project (ctb-i) consists of xinhua newswire between the years 1994 and 1998, totaling 100,000 words, fully segmented, pos-tagged and syntactically bracketed and it has been released to the public via the penn linguistic data consortium (ldc). 
the preliminary results of this phase of the project have been reported in xia et al (2000). currently the second installment of the project, the 400,000-word ctb-ii is being developed and is expected to be completed early in the year 2003. ctb-ii will follow the standards set up in the segmentation (xia 2000b), pos tagging (xia 2000a) and bracketing guidelines (xue and xia 2000) and it will use articles from peoples' daily, hong kong newswire and material translated into chinese from other languages in addition to the xinhua newswire used in ctb-i in an effort to diversify the sources. the availability of ctb-i changed our approach to ctb-ii considerably. due to the existence of ctb-i, we were able to train new automatic chinese language processing (clp) tools, which crucially use annotated corpora as training material. these tools are then used for preprocessing in the development of the ctb-ii. we also developed tools to control the quality of the corpus. in this paper, we will address three issues in the development of the chinese treebank: annotation speed, annotation accuracy and usability of the corpus. specifically, we attempt to answer four questions: (i) how do we speed up the annotation process, (ii) how do we maintain high quality, i.e. annotation accuracy and inter-annotator consistency during the annotation process, and (iii) for what purposes is the corpus applicable, and (iv) what are our future plans? although we will touch upon linguistic problems that are specific to chinese, we believe these issues are general enough for the development of any single language corpus. 1 annotation speed. there are three main factors that affect the annotation speed : annotators? background, guideline design and more importantly, the availability of preprocessing tools. we will discuss how each of these three factors affects annotation speed. 1.1 annotator background. 
even with the best sets of guidelines, it is important that annotators have received considerable training in linguistics, particularly in syntax. in both the segmentation/pos tagging phase and the syntactic bracketing phase, understanding the structure of the sentences is essential for correct annotation with reasonable speed. for example, the penn chinese treebank (ctb) is an ongoing project, with its objective being to create a segmented chinese corpus annotated with pos tags and syntactic brackets. for example, in both the segmentation/pos tagging phase and the syntactic bracketing phase, understanding the structure of the sentences is essential for correct annotation with reasonable speed. the preliminary results of this phase of the project have been reported in xia et al (2000). even with the best sets of guidelines, it is important that annotators have received considerable training in linguistics, particularly in syntax. the first installment of the project (ctb-i) consists of xinhua newswire between the years 1994 and 1998, totaling 100,000 words, fully segmented, pos-tagged and syntactically bracketed and it has been released to the public via the penn linguistic data consortium (ldc). currently the second installment of the project, the 400,000-word ctb-ii is being developed and is expected to be completed early in the year 2003. 1.1 annotator background. we will discuss how each of these three factors affects annotation speed. the availability of ctb-i changed our approach to ctb-ii considerably. background, guideline design and more importantly, the availability of preprocessing tools. ctb-ii will follow the standards set up in the segmentation (xia 2000b), pos tagging (xia 2000a) and bracketing guidelines (xue and xia 2000) and it will use articles from peoples' daily, hong kong newswire and material translated into chinese from other languages in addition to the xinhua newswire used in ctb-i in an effort to diversify the sources. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C02-1150.txt b/DATASET_PACSUM/dataset/inputs/C02-1150.txt new file mode 100644 index 0000000000000000000000000000000000000000..607515ecf5480ec5989d9c9d4df267153828fe70 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C02-1150.txt @@ -0,0 +1 @@ +open-domain question answering (lehnert, 1986; harabagiu et al, 2001; light et al, 2001) and storycomprehension (hirschman et al, 1999) have become important directions in natural language pro cessing. question answering is a retrieval task morechallenging than common search engine tasks be cause its purpose is to find an accurate and conciseanswer to a question rather than a relevant docu ment. the difficulty is more acute in tasks such as story comprehension in which the target text is less likely to overlap with the text in the questions. for this reason, advanced natural language techniques rather than simple key term extraction are needed.one of the important stages in this process is analyz ing the question to a degree that allows determining the ?type? of the sought after answer. in the treccompetition (voorhees, 2000), participants are requested to build a system which, given a set of en glish questions, can automatically extract answers (a short phrase) of no more than 50 bytes from a5-gigabyte document library. participants have re research supported by nsf grants iis-9801638 and itr iis 0085836 and an onr muri award. alized that locating an answer accurately hinges on first filtering out a wide range of candidates (hovy et al, 2001; ittycheriah et al, 2001) based on some categorization of answer types. this work develops a machine learning approach to question classification (qc) (harabagiu et al, 2001; hermjakob, 2001). our goal is to categorize questions into different semantic classes that impose constraints on potential answers, so that they can be utilized in later stages of the question answeringprocess. 
for example, when considering the question q: what canadian city has the largest popula tion?, the hope is to classify this question as havinganswer type city, implying that only candidate an swers that are cities need consideration.based on the snow learning architecture, we develop a hierarchical classifier that is guided by a lay ered semantic hierarchy of answer types and is able to classify questions into fine-grained classes. wesuggest that it is useful to consider this classifica tion task as a multi-label classification and find that it is possible to achieve good classification results(over 90%) despite the fact that the number of dif ferent labels used is fairly large, 50. we observe thatlocal features are not sufficient to support this accu racy, and that inducing semantic features is crucial for good performance. the paper is organized as follows: sec. 2 presents the question classification problem; sec. 3 discusses the learning issues involved in qc and presents ourlearning approach; sec. 4 describes our experimen tal study.this paper presents a machine learning approach to question classification. 4 describes our experimen tal study. in future work we plan to investigate further the application of deeper semantic analysis (including better named entity and semantic categorization) to feature extraction, automate the generation of thesemantic features and develop a better understand ing to some of the learning issues involved in thedifference between a flat and a hierarchical classi fier. question answering is a retrieval task morechallenging than common search engine tasks be cause its purpose is to find an accurate and conciseanswer to a question rather than a relevant docu ment. we define question classification(qc) here to be the task that, given a question, maps it to one of k classes, which provide a semantic constraint on the sought-after answer1. 
open-domain question answering (lehnert, 1986; harabagiu et al, 2001; light et al, 2001) and storycomprehension (hirschman et al, 1999) have become important directions in natural language pro cessing. the ambiguity causes the classifier not to output equivalent term as the first choice. we designed two experiments to test the accuracy ofour classifier on trec questions. what do bats eat?. in this case, both classes are ac ceptable. the first experi ment evaluates the contribution of different featuretypes to the quality of the classification. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C02-2025.txt b/DATASET_PACSUM/dataset/inputs/C02-2025.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4f839364064fbf34af28cc6a2df4ad1ef3f58ae --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C02-2025.txt @@ -0,0 +1 @@ +for the past decade or more, symbolic, linguistically ori- ented methods and statistical or machine learning ap- proaches to nlp have often been perceived as incompat- ible or even competing paradigms. while shallow and probabilistic processing techniques have produced use- ful results in many classes of applications, they have not met the full range of needs for nlp, particularly where precise interpretation is important, or where the variety of linguistic expression is large relative to the amount of training data available. on the other hand, deep approaches to nlp have only recently achieved broad enough grammatical coverage and sufficient processing efficiency to allow the use of precise linguistic grammars in certain types of real-world applications. 
in particular, applications of broad-coverage analyti- cal grammars for parsing or generation require the use of sophisticated statistical techniques for resolving ambigu- ities; the transfer of head-driven phrase structure gram- mar (hpsg) systems into industry, for example, has am- plified the need for general parse ranking, disambigua- tion, and robust recovery techniques. we observe general consensus on the necessity for bridging activities, com- bining symbolic and stochastic approaches to nlp. but although we find promising research in stochastic pars- ing in a number of frameworks, there is a lack of appro- priately rich and dynamic language corpora for hpsg. likewise, stochastic parsing has so far been focussed on information-extraction-type applications and lacks any depth of semantic interpretation. the redwoods initia- tive is designed to fill in this gap. in the next section, we present some of the motivation for the lingo redwoods project as a treebank develop- ment process. although construction of the treebank is in its early stages, we present in section 3 some prelim- inary results of using the treebank data already acquired on concrete applications. we show, for instance, that even simple statistical models of parse ranking trained on the redwoods corpus built so far can disambiguate parses with close to 80% accuracy. 2 a rich and dynamic treebank the redwoods treebank is based on open-source hpsg resources developed by a broad consortium of re- search groups including researchers at stanford (usa), saarbru?cken (germany), cambridge, edinburgh, and sussex (uk), and tokyo (japan). their wide distribution and common acceptance make the hpsg framework and resources an excellent anchor point for the redwoods treebanking initiative. 
the key innovative aspect of the redwoods ap- proach to treebanking is the anchoring of all linguis- tic data captured in the treebank to the hpsg frame- work and a generally-available broad-coverage gram- mar of english, the lingo english resource grammar (flickinger, 2000) as implemented with the lkb gram- mar development environment (copestake, 2002). un- like existing treebanks, there is no need to define a (new) form of grammatical representation specific to the tree- bank.the lingo redwoods treebank motivation and preliminary applications stephan oepen, kristina toutanova, stuart shieber, christopher manning, dan flickinger, and thorsten brants {oe |kristina |manning |dan}@csli.stanford.edu, shieber@deas.harvard.edu, brants@parc.xerox.com abstract the lingo redwoods initiative is a seed activity in the de- sign and development of a new type of treebank. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1010.txt b/DATASET_PACSUM/dataset/inputs/C04-1010.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6ee1d34264f242a564a87f465a730773f58fb33 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1010.txt @@ -0,0 +1 @@ +there has been a steadily increasing interest in syntactic parsing based on dependency analysis in re cent years. one important reason seems to be thatdependency parsing offers a good compromise be tween the conflicting demands of analysis depth, on the one hand, and robustness and efficiency, on the other. thus, whereas a complete dependency structure provides a fully disambiguated analysisof a sentence, this analysis is typically less complex than in frameworks based on constituent analysis and can therefore often be computed determin istically with reasonable accuracy. 
deterministicmethods for dependency parsing have now been ap plied to a variety of languages, including japanese (kudo and matsumoto, 2000), english (yamada and matsumoto, 2003), turkish (oflazer, 2003), and swedish (nivre et al, 2004). for english, the interest in dependency parsing has been weaker than for other languages. to some extent, this can probably be explained by the strong tradition of constituent analysis in anglo-american linguistics, but this trend has been reinforced by the fact that the major treebank of american english,the penn treebank (marcus et al, 1993), is anno tated primarily with constituent analysis. on the other hand, the best available parsers trained on thepenn treebank, those of collins (1997) and charniak (2000), use statistical models for disambigua tion that make crucial use of dependency relations. moreover, the deterministic dependency parser of yamada and matsumoto (2003), when trained on the penn treebank, gives a dependency accuracy that is almost as good as that of collins (1997) and charniak (2000). the parser described in this paper is similar to that of yamada and matsumoto (2003) in that it uses a deterministic parsing algorithm in combination with a classifier induced from a treebank. however, there are also important differences between the twoapproaches. first of all, whereas yamada and matsumoto employs a strict bottom-up algorithm (es sentially shift-reduce parsing) with multiple passes over the input, the present parser uses the algorithmproposed in nivre (2003), which combines bottom up and top-down processing in a single pass in order to achieve incrementality. this also means that the time complexity of the algorithm used here is linearin the size of the input, while the algorithm of ya mada and matsumoto is quadratic in the worst case. another difference is that yamada and matsumoto use support vector machines (vapnik, 1995), whilewe instead rely on memory-based learning (daele mans, 1999). 
most importantly, however, the parser presented in this paper constructs labeled dependency graphs, i.e. dependency graphs where arcs are labeled with dependency types. as far as we know, this makesit different from all previous systems for dependency parsing applied to the penn treebank (eis ner, 1996; yamada and matsumoto, 2003), althoughthere are systems that extract labeled grammatical relations based on shallow parsing, e.g. buchholz (2002). the fact that we are working with labeled dependency graphs is also one of the motivations for choosing memory-based learning over sup port vector machines, since we require a multi-class classifier. even though it is possible to use svmfor multi-class classification, this can get cumber some when the number of classes is large. (for the the ? dep finger-pointing ? np-sbj has already ? advp begun ? vp . ? dep figure 1: dependency graph for english sentenceunlabeled dependency parser of yamada and matsumoto (2003) the classification problem only in volves three classes.) the parsing methodology investigated here haspreviously been applied to swedish, where promis ing results were obtained with a relatively smalltreebank (approximately 5000 sentences for train ing), resulting in an attachment score of 84.7% and a labeled accuracy of 80.6% (nivre et al, 2004).1 however, since there are no comparable resultsavailable for swedish, it is difficult to assess the significance of these findings, which is one of the reasons why we want to apply the method to a bench mark corpus such as the the penn treebank, even though the annotation in this corpus is not ideal for labeled dependency parsing.the paper is structured as follows. section 2 describes the parsing algorithm, while section 3 ex plains how memory-based learning is used to guidethe parser. 
experimental results are reported in sec tion 4, and conclusions are stated in section 5.the conversion of the penn tree bank to dependency trees has been performed using head rules kindly provided by hiroyasu yamada and yuji matsumoto. there has been a steadily increasing interest in syntactic parsing based on dependency analysis in re cent years. experimental results are reported in sec tion 4, and conclusions are stated in section 5. sentences whose unlabeled dependency structure is completely correct (yamada and mat sumoto, 2003). one important reason seems to be thatdependency parsing offers a good compromise be tween the conflicting demands of analysis depth, on the one hand, and robustness and efficiency, on the other. the memory-based classifiers used in the experiments have been constructed using thetilburg memory-based learner (timbl) (daelemans et al, 2003). first of all, we see that model 1 gives better accuracy than model 2 with the smaller label set g, which confirms our expectations that the added part-of-speech featuresare helpful when the dependency labels are less informative. acknowledgements the work presented in this paper has been supportedby a grant from the swedish research council (621 2002-4207). all metrics except cm are calculated as meanscores per word, and punctuation tokens are con sistently excluded.table 1 shows the attachment score, both unla beled and labeled, for the two different state models with the two different label sets. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1024.txt b/DATASET_PACSUM/dataset/inputs/C04-1024.txt new file mode 100644 index 0000000000000000000000000000000000000000..54be326d1fb0f9aee041735adb6323ab6ce373a3 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1024.txt @@ -0,0 +1 @@ +large context-free grammars extracted from tree banks achieve high coverage and accuracy, but they are difficult to parse with because of their massive ambiguity. 
the application of standard chart-parsing techniques often fails due to excessive memory and runtime requirements.treebank grammars are mostly used as probabilis tic grammars and users are usually only interested in the best analysis, the viterbi parse. to speed up viterbi parsing, sophisticated search strategies havebeen developed which find the most probable anal ysis without examining the whole set of possible analyses (charniak et al, 1998; klein and manning,2003a). these methods reduce the number of gener ated edges, but increase the amount of time needed for each edge. the parser described in this paper follows a contrary approach: instead of reducing the number of edges, it minimises the costs of building edges in terms of memory and runtime.the new parser, called bitpar, is based on a bit vector implementation (cf. (graham et al, 1980)) of the well-known cocke-younger-kasami (cky) algorithm (kasami, 1965; younger, 1967). it buildsa compact ?parse forest? representation of all anal yses in two steps. in the first step, a cky-style recogniser fills the chart with constituents. in the second step, the parse forest is built top-down from the chart. viterbi parses are computed in four steps. again, the first step is a cky recogniser which is followed by a top-down filtering of the chart, the bottom-up computation of the viterbi probabilities, and the top-down extraction of the best parse.the rest of the paper is organised as follows: sec tion 2 explains the transformation of the grammar to chomsky normal form. the following sectionsdescribe the recogniser algorithm (sec. 3), improvements of the recogniser by means of bit-vector op erations (sec. 4), and the generation of parse forests(sec. 5), and viterbi parses (sec. 6). 
section 7 discusses the advantages of the new architecture, sec tion 8 describes experimental results, and section 9 summarises the paper.(the rule a section 7 discusses the advantages of the new architecture, sec tion 8 describes experimental results, and section 9 summarises the paper. the cky algorithm requires a grammar in chom sky normal form where the right-hand side of eachrule either consists of two non-terminals or a single terminal symbol. large context-free grammars extracted from tree banks achieve high coverage and accuracy, but they are difficult to parse with because of their massive ambiguity. 5), and viterbi parses (sec. the application of standard chart-parsing techniques often fails due to excessive memory and runtime requirements.treebank grammars are mostly used as probabilis tic grammars and users are usually only interested in the best analysis, the viterbi parse. boring symbols on the right-hand sides of rules. bitpar uses a modified ver sion of the cky algorithm allowing also chain rules (rules with a single non-terminal on the right-handside). 4), and the generation of parse forests(sec. to speed up viterbi parsing, sophisticated search strategies havebeen developed which find the most probable anal ysis without examining the whole set of possible analyses (charniak et al, 1998; klein and manning,2003a). 3), improvements of the recogniser by means of bit-vector op erations (sec. these methods reduce the number of gener ated edges, but increase the amount of time needed for each edge. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1041.txt b/DATASET_PACSUM/dataset/inputs/C04-1041.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd2d5d586f4dc1b4508f27a3d136199804539500 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1041.txt @@ -0,0 +1 @@ +lexicalised grammar formalisms such as lexicalized tree adjoining grammar (ltag) and com binatory categorial grammar (ccg) assign one or more syntactic structures to each word in a sentencewhich are then manipulated by the parser. supertag ging was introduced for ltag as a way of increasingparsing efficiency by reducing the number of struc tures assigned to each word (bangalore and joshi, 1999). supertagging has more recently been applied to ccg (clark, 2002; curran and clark, 2003).supertagging accuracy is relatively high for man ually constructed ltags (bangalore and joshi,1999). however, for ltags extracted automati cally from the penn treebank, performance is much lower (chen et al, 1999; chen et al, 2002). in fact, performance for such grammars is below that needed for successful integration into a full parser (sarkar et al, 2000). in this paper we demonstratethat ccg supertagging accuracy is not only sufficient for accurate and robust parsing using an auto matically extracted grammar, but also offers several practical advantages. our wide-coverage ccg parser uses a log-linear model to select an analysis. the model paramaters are estimated using a discriminative method, that is,one which requires all incorrect parses for a sentence as well as the correct parse. since an auto matically extracted ccg grammar can produce anextremely large number of parses, the use of a su pertagger is crucial in limiting the total number of parses for the training data to a computationally manageable number. the supertagger is also crucial for increasing thespeed of the parser. 
we show that spectacular in creases in speed can be obtained, without affectingaccuracy or coverage, by tightly integrating the su pertagger with the ccg grammar and parser. to achieve maximum speed, the supertagger initially assigns only a small number of ccg categories toeach word, and the parser only requests more cate gories from the supertagger if it cannot provide an analysis. we also demonstrate how extra constraints on the category combinations, and the application of beam search using the parsing model, can further increase parsing speed.this is the first work we are aware of to succes fully integrate a supertagger with a full parser which uses a lexicalised grammar automatically extractedfrom the penn treebank. we also report signifi cantly higher parsing speeds on newspaper text than any previously reported for a full wide-coverage parser. our results confirm that wide-coverage ccg parsing is feasible for many large-scale nlp tasks.this research was supported by epsrc grant gr/m96889, and a commonwealth scholarship and a sydney university travelling scholarship to the second author. this paper has shown that by tightly integrating a supertagger with a ccg parser, very fast parse times can be achieved for penn treebank wsj text. our results confirm that wide-coverage ccg parsing is feasible for many large-scale nlp tasks. lexicalised grammar formalisms such as lexicalized tree adjoining grammar (ltag) and com binatory categorial grammar (ccg) assign one or more syntactic structures to each word in a sentencewhich are then manipulated by the parser. supertag ging was introduced for ltag as a way of increasingparsing efficiency by reducing the number of struc tures assigned to each word (bangalore and joshi, 1999). the best speeds we have reported for the ccg parser are an order of magnitude faster. 
to give one example, the number of categories in the tag dictionary?s entry for the wordis is 45 (only considering categories which have appeared at least 10 times in the training data). we also report signifi cantly higher parsing speeds on newspaper text than any previously reported for a full wide-coverage parser. however, in the sentence mr. vinken is chairman of elsevier n.v., the dutch publishing group., the supertag ger correctly assigns 1 category to is for ? = 0.1, and 3 categories for ? = 0.01. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1051.txt b/DATASET_PACSUM/dataset/inputs/C04-1051.txt new file mode 100644 index 0000000000000000000000000000000000000000..a04617637ab829676b3123c244dab58633a09f6a --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1051.txt @@ -0,0 +1 @@ +the importance of learning to manipulate monolingual paraphrase relationships for applications like summarization, search, and dialog has been highlighted by a number of recent efforts (barzilay & mckeown 2001; shinyama et al 2002; lee & barzilay 2003; lin & pantel 2001). while several different learning methods have been applied to this problem, all share a need for large amounts of data in the form of pairs or sets of strings that are likely to exhibit lexical and/or structural paraphrase alternations. one approach1 1 an alternative approach involves identifying anchor points--pairs of words linked in a known way--and collecting the strings that intervene. (shinyama, et al 2002; lin & pantel 2001). since our interest is in that has been successfully used is edit distance, a measure of similarity between strings. the assumption is that strings separated by a small edit distance will tend to be similar in meaning: the leading indicators measure the economy? the leading index measures the economy?. lee & barzilay (2003), for example, use multi sequence alignment (msa) to build a corpus of paraphrases involving terrorist acts. 
their goal is to extract sentential templates that can be used in high-precision generation of paraphrase alter nations within a limited domain. our goal here is rather different: our interest lies in constructing a monolingual broad-domain corpus of pairwise aligned sentences. such data would be amenable to conventional statistical machine translation (smt) techniques (e.g., those discussed in och & ney 2003).2 in what follows we compare two strategies for unsupervised construction of such a corpus, one employing string similarity and the other associating sentences that may overlap very little at the string level. we measure the relative utility of the two derived monolingual corpora in the context of word alignment techniques developed originally for bilingual text. we show that although the edit distance corpus is well-suited as training data for the alignment algorithms currently used in smt, it is an incomplete source of information about paraphrase relations, which exhibit many of the characteristics of comparable bilingual corpora or free translations. many of the more complex alternations that characterize monolingual paraphrase, such as large-scale lexical alternations and constituent reorderings, are not readily learning sentence level paraphrases, including major constituent reorganizations, we do not address this approach here. 2 barzilay & mckeown (2001) consider the possibility of using smt machinery, but reject the idea because of the noisy, comparable nature of their dataset. captured by edit distance techniques, which conflate semantic similarity with formal similarity. we conclude that paraphrase research would benefit by identifying richer data sources and developing appropriate learning techniques.we remain, however, responsible for all content. edit distance identifies sentence pairs that exhibit lexical and short phrasal alternations that can be aligned with considerable success. 
we conclude that paraphrase research would benefit by identifying richer data sources and developing appropriate learning techniques. we have also benefited from discussions with ken church, mark johnson, daniel marcu and franz och. the importance of learning to manipulate monolingual paraphrase relationships for applications like summarization, search, and dialog has been highlighted by a number of recent efforts (barzilay & mckeown 2001; shinyama et al 2002; lee & barzilay 2003; lin & pantel 2001). given a large dataset and a well-motivated clustering of documents, useful datasets can be gleaned even without resorting to more sophisticated techniques figure 2. captured by edit distance techniques, which conflate semantic similarity with formal similarity. the second relied on a discourse-based heuristic, specific to the news genre, to identify likely paraphrase pairs even when they have little superficial similarity. while several different learning methods have been applied to this problem, all share a need for large amounts of data in the form of pairs or sets of strings that are likely to exhibit lexical and/or structural paraphrase alternations. our two paraphrase datasets are distilled from a corpus of news articles gathered from thousands of news sources over an extended period. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1059.txt b/DATASET_PACSUM/dataset/inputs/C04-1059.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ffe79b74a19fc030e71fc349c6741f95f45f22b --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1059.txt @@ -0,0 +1 @@ +language models (lm) are applied in many natural language processing applications, such as speech recognition and machine translation, to encapsulate syntactic, semantic and pragmatic information. for systems which learn from given data we frequently observe a severe drop in performance when moving to a new genre or new domain. 
in speech recognition a number of adaptation techniques have been developed to cope with this situation. in statistical machine translation we have a similar situation, i.e. estimate the model parameter from some data, and use the system to translate sentences which may not be well covered by the training data. therefore, the potential of adaptation techniques needs to be explored for machine translation applications. statistical machine translation is based on the noisy channel model, where the translation hypothesis is searched over the space defined by a translation model and a target language (brown et al, 1993). statistical machine translation can be formulated as follows: )()|(maxarg)|(maxarg* tptspstpt tt ?== where t is the target sentence, and s is the source sentence. p(t) is the target language model and p(s|t) is the translation model. the argmax operation is the search, which is done by the decoder. in the current study we modify the target language model p(t), to represent the test data better, and thereby improve the translation quality. (janiszek, et al 2001) list the following approaches to language model adaptation: ? linear interpolation of a general and a domain specific model (seymore, rosenfeld, 1997). back off of domain specific probabilities with those of a specific model (besling, meier, 1995). retrieval of documents pertinent to the new domain and training a language model on-line with those data (iyer, ostendorf, 1999, mahajan et. al. 1999). maximum entropy, minimum discrimination adaptation (chen, et. al., 1998). adaptation by linear transformation of vectors of bigram counts in a reduced space (demori, federico, 1999). smoothing and adaptation in a dual space via latent semantic analysis, modeling long-term semantic dependencies, and trigger combinations. (j. bellegarda, 2000). 
our approach can be characterized as unsupervised data augmentation by retrieval of relevant documents from large monolingual corpora, and interpolation of the specific language model, build from the retrieved data, with a background language model. to be more specific, the following steps are carried out to do the language model adaptation. first, a baseline statistical machine translation system, using a large general language model, is applied to generate initial translations. then these translations hypotheses are reformulated as queries to retrieve similar sentences from a very large text collection. a small domain specific language model is build using the retrieved sentences and linearly interpolated with the background language model. this new interpolated language model in applied in a second decoding run to produce the final translations. there are a number of interesting questions pertaining to this approach: ? which information can and should used to generate the queries: the first-best translation only, or also translation alternatives. how should we construct the queries, just as simple bag-of-words, or can we incorporate more structure to make them more powerful. how many documents should be retrieved to build the specific language models, and on what granularity should this be done, i.e. what is a document in the information retrieval process. the paper is structured as follows: section 2 outlines the sentence retrieval approach, and three bag-of-words query models are designed and explored; structured query models are introduced in section 3. in section 4 we present translation experiments are presented for the different query. finally, summary is given in section 5.in this paper, we studied language model adaptation for statistical machine translation. this might be especially useful for structured query models generated from the translation lattices. finally, summary is given in section 5. 
language models (lm) are applied in many natural language processing applications, such as speech recognition and machine translation, to encapsulate syntactic, semantic and pragmatic information. in section 4 we present translation experiments are presented for the different query. for systems which learn from given data we frequently observe a severe drop in performance when moving to a new genre or new domain. the paper is structured as follows: section 2 outlines the sentence retrieval approach, and three bag-of-words query models are designed and explored; structured query models are introduced in section 3. in speech recognition a number of adaptation techniques have been developed to cope with this situation. our language model adaptation is an unsupervised data augmentation approach guided by query models. on the other side the oracle experiment also shows that the optimally expected improvement is limited by the translation model and decoding algorithm used in the current smt system. this also means tmq is subject to more noise. experiments are carried out on a standard statistical machine translation task defined in the nist evaluation in june 2002. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1072.txt b/DATASET_PACSUM/dataset/inputs/C04-1072.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b6ff3685c2644f477fea5bdb49fa6722694cbf6 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1072.txt @@ -0,0 +1 @@ +to automatically evaluate machine translations, the machine translation community recently adopted an n-gram co-occurrence scoring procedure bleu (papineni et al 2001). a similar metric, nist, used by nist (nist 2002) in a couple of machine translation evaluations in the past two years is based on bleu. the main idea of bleu is to measure the translation closeness between a candidate translation and a set of reference translations with a numerical metric. 
although the idea of using objective functions to automatically evaluate machine translation quality is not new (su et al 1992), the success of bleu prompts a lot of interests in developing better automatic evaluation metrics. for example, akiba et al (2001) proposed a metric called red based on edit distances over a set of multiple references. nie?en et al (2000) calculated the length normalized edit distance, called word error rate (wer), between a candidate and multiple reference translations. leusch et al (2003) proposed a related measure called position independent word error rate (per) that did not consider word position, i.e. using bag-of-words instead. turian et al (2003) introduced general text matcher (gtm) based on accuracy measures such as recall, precision, and f-measure. with so many different automatic metrics available, it is necessary to have a common and objective way to evaluate these metrics. comparison of automatic evaluation metrics are usually conducted on corpus level using correlation analysis between human scores and automatic scores such as bleu, nist, wer, and per. however, the performance of automatic metrics in terms of human vs. system correlation analysis is not stable across different evaluation settings. for example, table 1 shows the pearson?s linear correlation coefficient analysis of 8 machine translation systems from 2003 nist chinese english machine translation evaluation. the pearson? correlation coefficients are computed according to different automatic evaluation methods vs. human assigned adequacy and fluency. bleu1, 4, and 12 are bleu with maximum n-gram lengths of 1, 4, and 12 respectively. gtm10, 20, and 30 are gtm with exponents of 1.0, 2.0, and 3.0 respectively. 95% confidence intervals are estimated using bootstrap resampling (davison and hinkley 1997). from the bleu group, we found that shorter bleu has better adequacy correlation while longer bleu has better fluency correlation. 
gtm with smaller exponent has better adequacy correlation and gtm with larger exponent has better fluency correlation. nist is very good in adequacy correlation but not as good as gtm30 in fluency correlation. based on these observations, we are not able to conclude which metric is the best because it depends on the manual evaluation criteria. this results also indicate that high correlation between human and automatic scores in both adequacy and fluency cannot always been achieved at the same time. the best performing metrics in fluency according to table 1 are bleu12 and gtm30 (dark/green cells). however, many metrics are statistically equivalent (gray cells) to them when we factor in the 95% confidence intervals. for example, even per is as good as bleu12 in adequacy. one reason for this might be due to data sparseness since only 8 systems are available. the other potential problem for correlation analysis of human vs. automatic framework is that high corpus-level correlation might not translate to high sentence-level correlation. however, high sentence-level correlation is often an important property that machine translation researchers look for. for example, candidate translations shorter than 12 words would have zero bleu12 score but bleu12 has the best correlation with human judgment in fluency as shown in table 1. in order to evaluate the ever increasing number of automatic evaluation metrics for machine translation objectively, efficiently, and reliably, we introduce a new evaluation method: orange. we describe orange in details in section 2 and briefly introduce three new automatic metrics that will be used in comparisons in section 3. the results of comparing several existing automatic metrics and the three new automatic metrics using orange will be presented in section 4. we conclude this paper and discuss future directions in section 5.we conclude this paper and discuss future directions in section 5. 
however, we plan to conduct the sampling procedure to verify this is indeed the case. we conjecture that this is the case for the currently available machine translation systems. the orange score for each metric is calculated as the average rank of the average reference (oracle) score over the whole corpus (872 sentences) divided by the length of the n-best list plus 1. the results of comparing several existing automatic metrics and the three new automatic metrics using orange will be presented in section 4. if the portion is small then the orange method can be confidently applied. to automatically evaluate machine translations, the machine translation community recently adopted an n-gram co-occurrence scoring procedure bleu (papineni et al 2001). assuming the length of the n-best list is n and the size of the corpus is s (in number of sentences), we compute orange as follows: orange = )1( )( 1 + ??? ranging from 0 to 9 (rouge-s0 to s9) and without any skip distance limit (rouge-s*) we compute the average score of the references and then rank the candidate translations and the references according to these automatic scores. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1080.txt b/DATASET_PACSUM/dataset/inputs/C04-1080.txt new file mode 100644 index 0000000000000000000000000000000000000000..d430700648cf2c76c1fc2fe178cc7c508e15cc9f --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1080.txt @@ -0,0 +1 @@ +the empiricist revolution in computational linguistics has dramatically shifted the accepted boundary between what kinds of knowledge are best supplied by humans and what kinds are best learned from data, with much of the human supplied knowledge now being in the form of annotations of data. as we look to the future, we expect that relatively unsupervised methods will grow in applicability, reducing the need for expensive human annotation of data. 
with respect to part-of-speech tagging, we believe that the way forward from the relatively small number of languages for which we can currently identify parts of speech in context with reasonable accuracy will make use of unsupervised methods that require only an untagged corpus and a lexicon of words and their possible parts of speech. we believe this based on the fact that such lexicons exist for many more languages (in the form of conventional dictionaries) than extensive human-tagged training corpora exist for. unsupervised part-of-speech tagging, as defined above, has been attempted using a variety of learning algorithms (brill 1995, church, 1988, cutting et. al. 1992, elworthy, 1994 kupiec 1992, merialdo 1991). while this makes unsupervised part-of-speech tagging a relatively well-studied problem, published results to date have not been comparable with respect to the training and test data used, or the lexicons which have been made available to the learners. in this paper, we provide the first comprehensive comparison of methods for unsupervised part-of speech tagging. in addition, we explore two new ideas for improving tagging accuracy. first, we explore an hmm approach to tagging that uses context on both sides of the word to be tagged, inspired by previous work on building bidirectionality into graphical models (lafferty et. al. 2001, toutanova et. al. 2003). second we describe a method for sequential unsupervised training of tag sequence and lexical probabilities in an hmm, which we observe leads to improved accuracy over simultaneous training with certain types of models. in section 2, we provide a brief description of the methods we evaluate and review published results. section 3 describes the contextualized variation on hmm tagging that we have explored. in section 4 we provide a direct comparison of several unsupervised part-of-speech taggers, which is followed by section 5, in which we present a new method for training with suboptimal lexicons. 
in section 6, we revisit our new approach to hmm tagging, this time, in the supervised framework.in the future, we will consider making an increase the context-size, which helped toutanova et al (2003). in section 6, we revisit our new approach to hmm tagging, this time, in the supervised framework. the empiricist revolution in computational linguistics has dramatically shifted the accepted boundary between what kinds of knowledge are best supplied by humans and what kinds are best learned from data, with much of the human supplied knowledge now being in the form of annotations of data. this result falls only slightly below the full-blown training intensive dependency-based conditional model. we have presented a comprehensive evaluation of several methods for unsupervised part-of-speech tagging, comparing several variations of hidden markov model taggers and unsupervised transformation-based learning using the same corpus and same lexicons. in section 4 we provide a direct comparison of several unsupervised part-of-speech taggers, which is followed by section 5, in which we present a new method for training with suboptimal lexicons. using a 50% 50% train-test split of the penn treebank to assess hmms, maximum entropy markov models (memms) and conditional random fields (crfs), they found that crfs, which make use of observation features from both the past and future, outperformed hmms which in turn outperformed memms. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1081.txt b/DATASET_PACSUM/dataset/inputs/C04-1081.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b8ff9f1f58d74eea103c07d3dc76039cc9b67e6 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1081.txt @@ -0,0 +1 @@ +unlike english and other western languages, many asian languages such as chinese, japanese, and thai, do not delimit words by white-space. wordsegmentation is therefore a key precursor for language processing tasks in these languages. 
for chinese, there has been significant research on find ing word boundaries in unsegmented sequences(see (sproat and shih, 2002) for a review). un fortunately, building a chinese word segmentation system is complicated by the fact that there is no standard definition of word boundaries in chinese. approaches to chinese segmentation fall roughly into two categories: heuristic dictionary-based methods and statistical machine learning methods.in dictionary-based methods, a predefined dictio nary is used along with hand-generated rules for segmenting input sequence (wu, 1999). howeverthese approaches have been limited by the impossibility of creating a lexicon that includes all possible chinese words and by the lack of robust statistical inference in the rules. machine learning approaches are more desirable and have been successful in both unsupervised learning (peng and schuur mans, 2001) and supervised learning (teahan et al, 2000). many current approaches suffer from either lackof exact inference over sequences or difficulty in incorporating domain knowledge effectively into seg mentation. domain knowledge is either not used, used in a limited way, or used in a complicated way spread across different components. for example,the n-gram generative language modeling based ap proach of teahan et al(2000) does not use domainknowledge. gao et al(2003) uses class-based language for word segmentation where some word cat egory information can be incorporated. zhang et al (2003) use a hierarchical hidden markov model to incorporate lexical knowledge. a recent advance in this area is xue (2003), in which the author uses a sliding-window maximum entropy classifier to tag chinese characters into one of four position tags, and then covert these tags into a segmentation using rules. maximum entropy models give tremendousflexibility to incorporate arbitrary features. 
how ever, a traditional maximum entropy tagger, as used in xue (2003), labels characters without consideringdependencies among the predicted segmentation labels that is inherent in the state transitions of finite state sequence models. linear-chain conditional random fields (crfs) (lafferty et al, 2001) are models that address both issues above. unlike heuristic methods, they are principled probabilistic finite state models onwhich exact inference over sequences can be ef ficiently performed. unlike generative n-gram or hidden markov models, they have the ability to straightforwardly combine rich domain knowledge, for example in this paper, in the form of multiple readily-available lexicons. furthermore, they arediscriminatively-trained, and are often more accurate than generative models, even with the same fea tures. in their most general form, crfs are arbitrary undirected graphical models trained to maximize the conditional probability of the desired outputs given the corresponding inputs. in the linear-chainspecial case we use here, they can be roughly un derstood as discriminatively-trained hidden markovmodels with next-state transition functions represented by exponential models (as in maximum en tropy classifiers), and with great flexibility to viewthe observation sequence in terms of arbitrary, over lapping features, with long-range dependencies, and at multiple levels of granularity. these beneficialproperties suggests that crfs are a promising ap proach for chinese word segmentation.new word detection is one of the most impor tant problems in chinese information processing.many machine learning approaches have been pro posed (chen and bai, 1998; wu and jiang, 2000; nie et al, 1995). new word detection is normally considered as a separate process from segmentation.however, integrating them would benefit both seg mentation and new word detection. crfs provide aconvenient framework for doing this. 
they can pro duce not only a segmentation, but also confidence in local segmentation decisions, which can be usedto find new, unfamiliar character sequences sur rounded by high-confidence segmentations. thus, our new word detection is not a stand-alone process, but an integral part of segmentation. newly detected words are re-incorporated into our word lexicon,and used to improve segmentation. improved seg mentation can then be further used to improve new word detection. comparing chinese word segmentation accuracyacross systems can be difficult because many re search papers use different data sets and different ground-rules. some published results claim 98% or99% segmentation precision and recall, but these ei ther count only the words that occur in the lexicon, or use unrealistically simple data, lexicons that haveextremely small (or artificially non-existant) outof-vocabulary rates, short sentences or many numbers. a recent chinese word segmentation competition (sproat and emerson, 2003) has made compar isons easier. the competition provided four datasets with significantly different segmentation guidelines, and consistent train-test splits. the performance ofparticipating system varies significantly across different datasets. our system achieves top performance in two of the runs, and a state-of-the-art per formance on average. this indicates that crfs are a viable model for robust chinese word segmentation.this indicates that crfs are a viable model for robust chinese word segmentation. unlike english and other western languages, many asian languages such as chinese, japanese, and thai, do not delimit words by white-space. wordsegmentation is therefore a key precursor for language processing tasks in these languages. the contribution of this paper is three-fold. our system achieves top performance in two of the runs, and a state-of-the-art per formance on average. 
for chinese, there has been significant research on find ing word boundaries in unsegmented sequences(see (sproat and shih, 2002) for a review). the performance ofparticipating system varies significantly across different datasets. acknowledgmentsthis work was supported in part by the center for intelligent information retrieval, in part by the cen tral intelligence agency, the national security agencyand national science foundation under nsf grant #iis 0326249, and in part by spawarsyscen-sd grant number n66001-02-1-8903. to make a comprehensive evaluation, we use allfour of the datasets from a recent chinese word segmentation bake-off competition (sproat and emer son, 2003). conditional random fields (crfs) are undirected graphical models trained to maximize a conditional probability (lafferty et al, 2001). however, training is a one-time process, and testing time is still linear in the length of the input. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1111.txt b/DATASET_PACSUM/dataset/inputs/C04-1111.txt new file mode 100644 index 0000000000000000000000000000000000000000..e748a9868f9512000fd969b5f3cbc65fd54b1a65 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1111.txt @@ -0,0 +1 @@ +the natural language processing (nlp) com munity has recently seen a growth in corpus-based methods. algorithms light in linguistic theories but rich in available training data have been successfully applied to several applications such as ma chine translation (och and ney 2002), information extraction (etzioni et al 2004), and question an swering (brill et al 2001). in the last decade, we have seen an explosion in the amount of available digital text resources. it is estimated that the internet contains hundreds of terabytes of text data, most of which is in an unstructured format. yet, many nlp algorithms tap into only megabytes or gigabytes of this information. 
in this paper, we make a step towards acquiring semantic knowledge from terabytes of data. we present an algorithm for extracting is-a relations, designed for the terascale, and compare it to a state of the art method that employs deep analysis of text (pantel and ravichandran 2004). we show that by simply utilizing more data on this task, we can achieve similar performance to a linguisticallyrich approach. the current state of the art co occurrence model requires an estimated 10 years just to parse a 1tb corpus (see table 1). instead of using a syntactically motivated co-occurrence ap proach as above, our system uses lexico-syntactic rules. in particular, it finds lexico-pos patterns by making modifications to the basic edit distance algorithm. once these patterns have been learnt, the algorithm for finding new is-a relations runs in o(n), where n is the number of sentences. in semantic hierarchies such as wordnet (miller 1990), an is-a relation between two words x and y represents a subordinate relationship (i.e. x is more specific than y). many algorithms have recently been proposed to automatically mine is-a (hypo nym/hypernym) relations between words. here, we focus on is-a relations that are characterized by the questions ?what/who is x?? for example, table 2 shows a sample of 10 is-a relations discovered by the algorithms presented in this paper. in this table, we call azalea, tiramisu, and winona ryder in stances of the respective concepts flower, dessert and actress. these kinds of is-a relations would be useful for various purposes such as ontology con struction, semantic information retrieval, question answering, etc. the main contribution of this paper is a comparison of the quality of our pattern-based and co occurrence models as a function of processing time and corpus size. also, the paper lays a foundation for terascale acquisition of knowledge. 
we will show that, for very small or very large corpora or for situations where recall is valued over precision, the pattern-based approach is best.there is a long standing need for higher quality performance in nlp systems. the natural language processing (nlp) com munity has recently seen a growth in corpus-based methods. our biggest challenge as we venture to the terascale is to use our new found wealth not only to build better systems, but to im prove our understanding of language. we will show that, for very small or very large corpora or for situations where recall is valued over precision, the pattern-based approach is best. also, the paper lays a foundation for terascale acquisition of knowledge. previous approaches to extracting is-a relations fall under two categories: pattern-based and co occurrence-based approaches. re cently, pantel and ravichandran (2004) extended this approach by making use of all syntactic de pendency features for each noun. there is promise for increasing our system accuracy by re ranking the outputs of the top-5 hypernyms. the per formance of the system in the top 5 category is much better than that of wordnet (38%). the focus is on the precision and recall of the systems as a func tion of the corpus size. algorithms light in linguistic theories but rich in available training data have been successfully applied to several applications such as ma chine translation (och and ney 2002), information extraction (etzioni et al 2004), and question an swering (brill et al 2001). 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1146.txt b/DATASET_PACSUM/dataset/inputs/C04-1146.txt new file mode 100644 index 0000000000000000000000000000000000000000..062493d111845412296f99f9b11c4a666ec88435 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1146.txt @@ -0,0 +1 @@ +over recent years, many natural language pro cessing (nlp) techniques have been developedthat might benefit from knowledge of distribu tionally similar words, i.e., words that occur in similar contexts. for example, the sparse dataproblem can make it difficult to construct language models which predict combinations of lex ical events. similarity-based smoothing (brown et al, 1992; dagan et al, 1999) is an intuitivelyappealing approach to this problem where prob abilities of unseen co-occurrences are estimatedfrom probabilities of seen co-occurrences of dis tributionally similar events.other potential applications apply the hy pothesised relationship (harris, 1968) betweendistributional similarity and semantic similar ity; i.e., similarity in the meaning of words can be predicted from their distributional similarity.one advantage of automatically generated the sauruses (grefenstette, 1994; lin, 1998; curranand moens, 2002) over large-scale manually cre ated thesauruses such as wordnet (fellbaum,1998) is that they might be tailored to a partic ular genre or domain.however, due to the lack of a tight defini tion for the concept of distributional similarity and the broad range of potential applications, alarge number of measures of distributional similarity have been proposed or adopted (see section 2). previous work on the evaluation of dis tributional similarity methods tends to either compare sets of distributionally similar words to a manually created semantic resource (lin, 1998; curran and moens, 2002) or be orientedtowards a particular task such as language mod elling (dagan et al, 1999; lee, 1999). 
the first approach is not ideal since it assumes that the goal of distributional similarity methods is to predict semantic similarity and that the semantic resource used is a valid gold standard. further, the second approach is clearly advantageous when one wishes to apply distributional similarity methods in a particular application area. however, it is not at all obvious that one universally best measure exists for all applications (weeds and weir, 2003). thus, applying a distributional similarity technique to a new application necessitates evaluating a large number of distributional similarity measures in addition to evaluating the new model or algorithm. we propose a shift in focus from attempting to discover the overall best distributional similarity measure to analysing the statistical and linguistic properties of sets of distributionally similar words returned by different measures. this will make it possible to predict in advance of any experimental evaluation which distributional similarity measures might be most appropriate for a particular application. further, we explore a problem faced by the automatic thesaurus generation community, which is that distributional similarity methods do not seem to offer any obvious way to distinguish between the semantic relations of synonymy, antonymy and hyponymy. previous work on this problem (caraballo, 1999; lin et al., 2003) involves identifying specific phrasal patterns within text e.g., 'xs and other ys' is used as evidence that x is a hyponym of y. our work explores the connection between relative frequency, distributional generality and semantic generality with promising results. the rest of this paper is organised as follows. in section 2, we present ten distributional similarity measures that have been proposed for use in nlp. in section 3, we analyse the variation in neighbour sets returned by these measures. 
in section 4, we take one fundamental statisticalproperty (word frequency) and analyse correla tion between this and the nearest neighbour setsgenerated. in section 5, we relate relative fre quency to a concept of distributional generalityand the semantic relation of hyponymy. in sec tion 6, we consider the effects that this has on a potential application of distributional similarity techniques, which is judging compositionality of collocations.in sec tion 6, we consider the effects that this has on a potential application of distributional similarity techniques, which is judging compositionality of collocations. over recent years, many natural language pro cessing (nlp) techniques have been developedthat might benefit from knowledge of distribu tionally similar words, i.e., words that occur in similar contexts. we would liketo thank adam kilgarriff and bill keller for use ful discussions. we have presented an analysis of a set of dis tributional similarity measures. for example, the sparse dataproblem can make it difficult to construct language models which predict combinations of lex ical events. in section 5, we relate relative fre quency to a concept of distributional generalityand the semantic relation of hyponymy. in its most general sense, a collocation is a habitual or lexicalised word combination. thus, it would seem that the three-way connection betweendistributional generality, hyponymy and rela tive frequency exists for verbs as well as nouns. we have seen that there is a large amount of variation in the neighbours selected by different measures andtherefore the choice of measure in a given appli cation is likely to be important. mean simhm(w2, w1) = 2.simp (w2,w1).simr(w2,w1) simp (w2,w1)+simr(w2,w1) where f (w) = {c : i(c, w) > 0} table 1: ten distributional similarity measures their harmonic mean (or f-score). 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1180.txt b/DATASET_PACSUM/dataset/inputs/C04-1180.txt new file mode 100644 index 0000000000000000000000000000000000000000..40edcb4ed683736d597471c80dee3fa422a81762 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1180.txt @@ -0,0 +1 @@ +the levels of accuracy and robustness recently achieved by statistical parsers (e.g. collins (1999),charniak (2000)) have led to their use in a num ber of nlp applications, such as question-answering(pasca and harabagiu, 2001), machine translation (charniak et al, 2003), sentence simplifica tion (carroll et al, 1999), and a linguist?s search engine (resnik and elkiss, 2003). such parsers typically return phrase-structure trees in the styleof the penn treebank, but without traces and co indexation. however, the usefulness of this outputis limited, since the underlying meaning (as repre sented in a predicate-argument structure or logical form) is difficult to reconstruct from such skeletal parse trees.in this paper we demonstrate how a widecoverage statistical parser using combinatory categorial grammar (ccg) can be used to generate semantic representations. there are a number of ad vantages to using ccg for this task. first, ccg provides ?surface compositional? analysis of certainsyntactic phenomena such as coordination and ex traction, allowing the logical form to be obtained for such cases in a straightforward way. second, ccg isa lexicalised grammar, and only uses a small num ber of semantically transparent combinatory rules tocombine ccg categories. hence providing a compositional semantics for ccg simply amounts to assigning semantic representations to the lexical en tries and interpreting the combinatory rules. 
and third, there exist highly accurate, efficient and robust ccg parsers which can be used directly for this task (clark and curran, 2004b; hockenmaier, 2003). the existing ccg parsers deliver predicate argument structures, but not semantic representations that can be used for inference. the present paper seeks to extend one of these wide coverage parsers by using it to build logical forms suitable for use in various nlp applications that require semantic interpretation. we show how to construct first-order representations from ccg derivations using the λ-calculus, and demonstrate that semantic representations can be produced for over 97% of the sentences in unseen wsj text. the only other deep parser we are aware of to achieve such levels of robustness for the wsj is kaplan et al (2004). the use of the λ-calculus is integral to our method. however, first-order representations are simply used as a proof-of-concept; we could have used drss (kamp and reyle, 1993) or some other representation more tailored to the application in hand. there is some existing work with a similar motivation to ours. briscoe and carroll (2002) generate underspecified semantic representations from their robust parser. toutanova et al (2002) and kaplan et al (2004) combine statistical methods with a linguistically motivated grammar formalism (hpsg and lfg respectively) in an attempt to achieve levels of robustness and accuracy comparable to the penn treebank parsers (which kaplan et al do achieve). however, there is a key difference between these approaches and ours. in our approach the creation of the semantic representations forms a completely it could cost taxpayers 15 million to install and residents 1 million a year to maintain np in our approach the creation of the semantic representations forms a completely it could cost taxpayers 15 million to install and residents 1 million a year to maintain np the levels of accuracy and robustness recently achieved by statistical parsers (e.g. 
collins (1999),charniak (2000)) have led to their use in a num ber of nlp applications, such as question-answering(pasca and harabagiu, 2001), machine translation (charniak et al, 2003), sentence simplifica tion (carroll et al, 1999), and a linguist?s search engine (resnik and elkiss, 2003). however, there is a key difference between these approaches and ours. such parsers typically return phrase-structure trees in the styleof the penn treebank, but without traces and co indexation. toutanova et al (2002) and ka plan et al (2004) combine statistical methods with a linguistically motivated grammar formalism (hpsg and lfg respectively) in an attempt to achieve levels of robustness and accuracy comparable to the penn treebank parsers (which kaplan et al do achieve). however, the usefulness of this outputis limited, since the underlying meaning (as repre sented in a predicate-argument structure or logical form) is difficult to reconstruct from such skeletal parse trees.in this paper we demonstrate how a widecoverage statistical parser using combinatory categorial grammar (ccg) can be used to generate semantic representations. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1197.txt b/DATASET_PACSUM/dataset/inputs/C04-1197.txt new file mode 100644 index 0000000000000000000000000000000000000000..34726fce0b34c562e83df80fc176f1e995f063c2 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1197.txt @@ -0,0 +1 @@ +semantic parsing of sentences is believed to be animportant task toward natural language understand ing, and has immediate applications in tasks such information extraction and question answering. we study semantic role labeling(srl). for each verb in a sentence, the goal is to identify all constituents that fill a semantic role, and to determine their roles,such as agent, patient or instrument, and their ad juncts, such as locative, temporal or manner. 
the propbank project (kingsbury and palmer, 2002) provides a large human-annotated corpus of semantic verb-argument relations. specifically, we use the data provided in the conll-2004 shared task of semantic-role labeling (carreras and màrquez, 2003) which consists of a portion of the propbank corpus, allowing us to compare the performance of our approach with other systems. previous approaches to the srl task have made use of a full syntactic parse of the sentence in order to define argument boundaries and to determine the role labels (gildea and palmer, 2002; chen and rambow, 2003; gildea and hockenmaier, 2003; pradhan et al, 2003; pradhan et al, 2004; surdeanu et al, 2003). in this work, following the conll-2004 shared task definition, we assume that the srl system takes as input only partial syntactic information, and no external lexico-semantic knowledge bases. specifically, we assume as input resources a part-of-speech tagger, a shallow parser that can process the input to the level of based chunks and clauses (tjong kim sang and buchholz, 2000; tjong kim sang and déjean, 2001), and a named-entity recognizer (tjong kim sang and de meulder, 2003). we do not assume a full parse as input. srl is a difficult task, and one cannot expect high levels of performance from either purely manual classifiers or purely learned classifiers. rather, supplemental linguistic information must be used to support and correct a learning system. so far, machine learning approaches to srl have incorporated linguistic information only implicitly, via the classifiers' features. the key innovation in our approach is the development of a principled method to combine machine learning techniques with linguistic and structural constraints by explicitly incorporating inference into the decision process. in the machine learning part, the system we present here is composed of two phases. 
first, a set of argument candidates is produced using twolearned classifiers?one to discover beginning po sitions and one to discover end positions of each argument type. hopefully, this phase discovers a small superset of all arguments in the sentence (foreach verb). in a second learning phase, the candi date arguments from the first phase are re-scored using a classifier designed to determine argument type, given a candidate argument.unfortunately, it is difficult to utilize global prop erties of the sentence into the learning phases.however, the inference level it is possible to incorporate the fact that the set of possible rolelabelings is restricted by both structural and lin guistic constraints?for example, arguments cannotstructurally overlap, or, given a predicate, some ar gument structures are illegal. the overall decision problem must produce an outcome that consistent with these constraints. we encode the constraints aslinear inequalities, and use integer linear programming(ilp) as an inference procedure to make a final decision that is both consistent with the con straints and most likely according to the learningsystem. although ilp is generally a computationally hard problem, there are efficient implementations that can run on thousands of variables and constraints. in our experiments, we used the commer cial ilp package (xpress-mp, 2003), and were able to process roughly twenty sentences per second.in our experiments, we used the commer cial ilp package (xpress-mp, 2003), and were able to process roughly twenty sentences per second. semantic parsing of sentences is believed to be animportant task toward natural language understand ing, and has immediate applications in tasks such information extraction and question answering. we study semantic role labeling(srl). although ilp is generally a computationally hard problem, there are efficient implementations that can run on thousands of variables and constraints. 
as more constraints are considered, we ex pect the overall performance to improve. see the details of the definition in kingsbury and palmer (2002) and carreras and ma`rquez (2003). we show that linguistic information is useful for se mantic role labeling, both in extracting features and dist. prec. the goal of the semantic-role labeling task is to dis cover the verb-argument structure for a given input sentence. we encode the constraints aslinear inequalities, and use integer linear programming(ilp) as an inference procedure to make a final decision that is both consistent with the con straints and most likely according to the learningsystem. for each verb in a sentence, the goal is to identify all constituents that fill a semantic role, and to determine their roles,such as agent, patient or instrument, and their ad juncts, such as locative, temporal or manner. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C04-1200.txt b/DATASET_PACSUM/dataset/inputs/C04-1200.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b86e6f55da8b279a0a148177ff6ff4b3a110826 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C04-1200.txt @@ -0,0 +1 @@ +what is an opinion? the many opinions on opinions are reflected in a considerable literature (aristotle 1954; perelman 1970; toulmin et al 1979; wallace 1975; toulmin 2003). recent computational work either focuses on sentence ?subjectivity? (wiebe et al 2002; riloff et al 2003), concentrates just on explicit statements of evaluation, such as of films (turney 2002; pang et al 2002), or focuses on just one aspect of opinion, e.g., (hatzivassiloglou and mckeown 1997) on adjectives. we wish to study opinion in general; our work most closely resembles that of (yu and hatzivassiloglou 2003). since an analytic definition of opinion is probably impossible anyway, we will not summarize past discussion or try to define formally what is and what is not an opinion. 
for our purposes, we describe an opinion as a quadruple [topic, holder, claim, sentiment] in which the holder believes a claim about the topic, and in many cases associates a sentiment, such as good or bad, with the belief. for example, the following opinions contain claims but no sentiments: ?i believe the world is flat? ?the gap is likely to go bankrupt? ?bin laden is hiding in pakistan? ?water always flushes anti-clockwise in the southern hemisphere? like yu and hatzivassiloglou (2003), we want to automatically identify sentiments, which in this work we define as an explicit or implicit expression in text of the holder?s positive, negative, or neutral regard toward the claim about the topic. (other sentiments we plan to study later.) sentiments always involve the holder?s emotions or desires, and may be present explicitly or only implicitly: ?i think that attacking iraq would put the us in a difficult position? (implicit) ?the us attack on iraq is wrong? (explicit) ?i like ike? (explicit) ?we should decrease our dependence on oil? (implicit) ?reps. tom petri and william f. goodling asserted that counting illegal aliens violates citizens? basic right to equal representation? (implicit) in this paper we address the following challenge problem. given a topic (e.g., ?should abortion be banned??) and a set of texts about the topic, find the sentiments expressed about (claims about) the topic (but not its supporting subtopics) in each text, and identify the people who hold each sentiment. to avoid the problem of differentiating between shades of sentiments, we simplify the problem to: identify just expressions of positive, negative, or neutral sentiments, together with their holders. in addition, for sentences that do not express a sentiment but simply state that some sentiment(s) exist(s), return these sentences in a separate set. for example, given the topic ?what should be done with medicare?? 
the sentence ?after years of empty promises, congress has rolled out two medicare prescription plans, one from house republicans and the other from the democratic sentence pos tagger verbs nounsadjectives adjective senti ment classifier sentiment sentiment sentence sentiment classifier opinion region + polarity + holder holder finder named entity tagger sentence sentence texts + topic sentiment sentiment sentiment v rbs verb senti ment classifier nouns noun senti ment classifier wordnet sentence : figure 1: system architecture. sens. bob graham of florida and zell miller of georgia? should be returned in the separate set. we approach the problem in stages, starting with words and moving on to sentences. we take as unit sentiment carrier a single word, and first classify each adjective, verb, and noun by its sentiment. we experimented with several classifier models. but combining sentiments requires additional care, as table 1 shows. california supreme court agreed that the state?s new term-limit law was constitutional. california supreme court disagreed that the state?s new term-limit law was constitutional. california supreme court agreed that the state?s new term-limit law was unconstitutional. california supreme court disagreed that the state?s new term-limit law was unconstitutional. table 1: combining sentiments. a sentence might even express opinions of different people. when combining word-level sentiments, we therefore first determine for each holder a relevant region within the sentence and then experiment with various models for combining word sentiments. we describe our models and algorithm in section 2, system experiments and discussion in section 3, and conclude in section 4.sentiment recognition is a challenging and difficult part of understanding opinions. nonetheless, as the experiments show, encouraging results can be obtained even with relatively simple models and only a small amount of manual seeding effort. what is an opinion? 
the many opinions on opinions are reflected in a considerable literature (aristotle 1954; perelman 1970; toulmin et al 1979; wallace 1975; toulmin 2003). a sentence might even express opinions of different people. recent computational work either focuses on sentence ?subjectivity? we describe our models and algorithm in section 2, system experiments and discussion in section 3, and conclude in section 4. when combining word-level sentiments, we therefore first determine for each holder a relevant region within the sentence and then experiment with various models for combining word sentiments. table 1: combining sentiments. (wiebe et al 2002; riloff et al 2003), concentrates just on explicit statements of evaluation, such as of films (turney 2002; pang et al 2002), or focuses on just one aspect of opinion, e.g., (hatzivassiloglou and mckeown 1997) on adjectives. we wish to study opinion in general; our work most closely resembles that of (yu and hatzivassiloglou 2003). california supreme court disagreed that the state?s new term-limit law was unconstitutional. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C08-1018.txt b/DATASET_PACSUM/dataset/inputs/C08-1018.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bf0b153c89df74546f9229f5391a1bad3b1aff6 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C08-1018.txt @@ -0,0 +1 @@ +automatic sentence compression can be broadly described as the task of creating a grammaticalsummary of a single sentence with minimal information loss. it has recently attracted much attention, in part because of its relevance to applications. examples include the generation of sub titles from spoken transcripts (vandeghinste and pan, 2004), the display of text on small screens such as mobile phones or pdas (corston-oliver, 2001), and, notably, summarisation (jing, 2000; lin, 2003). most prior work has focused on a specific instantiation of sentence compression, namely word deletion. 
given an input sentence of words, w 1 , w 2 . . . w n , a compression is formed by dropping any subset of these words (knight c ? 2008. licensed under the creative commonsattribution-noncommercial-share alike 3.0 unported li cense (http://creativecommons.org/licenses/by-nc-sa/3.0/). some rights reserved. and marcu, 2002). the simplification renders the task computationally feasible, allowing efficient decoding using a dynamic program (knight andmarcu, 2002; turner and charniak, 2005; mcdon ald, 2006). furthermore, constraining the problemto word deletion affords substantial modeling flexibility. indeed, a variety of models have been successfully developed for this task ranging from in stantiations of the noisy-channel model (knight and marcu, 2002; galley and mckeown, 2007;turner and charniak, 2005), to large-margin learn ing (mcdonald, 2006; cohn and lapata, 2007), and integer linear programming (clarke, 2008). however, the simplification also renders the tasksomewhat artificial. there are many rewrite operations that could compress a sentence, besides deletion, including reordering, substitution, and inser tion. in fact, professional abstractors tend to use these operations to transform selected sentences from an article into the corresponding summary sentences (jing, 2000). therefore, in this paper we consider sentence compression from a more general perspective and generate abstracts rather than extracts. in this framework, the goal is to find a summary of theoriginal sentence which is grammatical and conveys the most important information without necessarily using the same words in the same or der. our task is related to, but different from, paraphrase extraction (barzilay, 2003). we must not only have access to paraphrases (i.e., rewrite rules), but also be able to combine them in order to generate new text, while attempting to produce a shorter resulting string. 
quirk et al (2004) present an end-to-end paraphrasing system inspired byphrase-based machine translation that can both ac quire paraphrases and use them to generate new strings. however, their model is limited to lexical substitution ? no reordering takes place ? and is 137 lacking the compression objective.once we move away from extractive compres sion we are faced with two problems. first, wemust find an appropriate training set for our abstractive task. compression corpora are not natu rally available and existing paraphrase corpora do not normally contain compressions. our second problem concerns the modeling task itself. ideally, our learning framework should handle structural mismatches and complex rewriting operations.in what follows, we first present a new cor pus for abstractive compression which we created by having annotators compress sentences while rewriting them. besides obtaining useful data formodeling purposes, we also demonstrate that ab stractive compression is a meaningful task. we then present a tree-to-tree transducer capable of transforming an input parse tree into a compressed parse tree. our approach is based on synchronous tree substitution grammar (stsg, eisner (2003)),a formalism that can account for structural mismatches, and is trained discriminatively. specifi cally, we generalise the model of cohn and lapata (2007) to our abstractive task. we present a noveltree-to-tree grammar extraction method which acquires paraphrases from bilingual corpora and ensure coherent output by including a ngram language model as a feature. we also develop a number of loss functions suited to the abstractive compression task. 
we hope that some of the work described here might be of relevance to other gen eration tasks such as machine translation (eisner, 2003), multi-document summarisation (barzilay, 2003), and text simplification (carroll et al, 1999).our results are summarised in table 4, where we show the mean ratings for our system (abstract), the baseline (extract), and the gold standard. special thanks to phil blunsom, james clarke and miles osborne for their insightful suggestions. automatic sentence compression can be broadly described as the task of creating a grammaticalsummary of a single sentence with minimal information loss. we first performed an analysis of variance (anova)to examine the effect of different system compres sions. acknowledgements the authors acknowledge the support of epsrc (grants gr/t04540/01 and gr/t04557/01). we also develop a number of loss functions suited to the abstractive compression task. the anova revealed a reliable effect on both grammaticality and importance (significant over both subjects and items (p < 0.01)).we next examined in more detail between system differences. it has recently attracted much attention, in part because of its relevance to applications. we hope that some of the work described here might be of relevance to other gen eration tasks such as machine translation (eisner, 2003), multi-document summarisation (barzilay, 2003), and text simplification (carroll et al, 1999). finally, we planto apply the model to other paraphrasing tasks in cluding fully abstractive document summarisation (daum?e iii and marcu, 2002). 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C08-1022.txt b/DATASET_PACSUM/dataset/inputs/C08-1022.txt new file mode 100644 index 0000000000000000000000000000000000000000..11a0ff9143f1792cea30bd4df59f99795437a316 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C08-1022.txt @@ -0,0 +1 @@ +the field of research in natural language processing (nlp) applications for l2 language is constantly growing. this is largely driven by the ex panding population of l2 english speakers, whose varying levels of ability may require different types of nlp tools from those designed primarily for native speakers of the language. these include applications for use by the individual and within instructional contexts. among the key tools are error-checking applications, focusing particularly on areas which learners find the most challenging. prepositions and determiners are known to be oneof the most frequent sources of error for l2 en glish speakers, a finding supported by our analysisof a small error-tagged corpus we created (determiners 17% of errors, prepositions 12%). there fore, in developing a system for automatic error detection in l2 writing, it seems desirable to focus on these problematic, and very common, parts of speech (pos).this paper gives a brief overview of the prob lems posed by these pos and of related work. we c ? 2008. licensed under the creative commonsattribution-noncommercial-share alike 3.0 unported li cense (http://creativecommons.org/licenses/by-nc-sa/3.0/). some rights reserved. then present our proposed approach on both l1 and l2 data and discuss the results obtained so far.rachele de felice was supported by an ahrc scholar ship for the duration of her studies. the field of research in natural language processing (nlp) applications for l2 language is constantly growing. 
this paper discussed a contextual feature based approach to the automatic acquisition of models of use for prepositions and determiners, whichachieve an accuracy of 70.06% and 92.15% re spectively, and showed how it can be applied to anerror correction task for l2 writing, with promis ing early results. in developing this model, our first aim was not to create something which learns like a human, butsomething that works in the best and most effi cient possible way. prepositions are challenging for learners because they can appear to have an idiosyncratic behaviour which does not follow any predictable pattern even across nearly identical contexts. however, in noting both divergences and similarities between the two learners, human and machine, we may be able to derive useful insights into the way the learning processes operate, and what factors could be more or less important for them. then present our proposed approach on both l1 and l2 data and discuss the results obtained so far. therefore, here, too, it is very hard tocome up with clear-cut rules predicting every pos sible kind of occurrence. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C08-1098.txt b/DATASET_PACSUM/dataset/inputs/C08-1098.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa15b814f84759f1f93ed6783d223c32f92e733d --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C08-1098.txt @@ -0,0 +1 @@ +a hidden-markov-model part-of-speech tagger (brants, 2000, e.g.) computes the most probable pos tag sequence ? t n 1 = ? t 1 , ..., ? t n for a given word sequence w n 1 . ? t n 1 = argmax t n 1 p(t n 1 , w n 1 )the joint probability of the two sequences is de fined as the product of context probabilities and lexical probabilities over all pos tags: p(t n 1 , w n 1 ) = n ? i=1 p(t i |t i?1 i?k ) ? ?? context prob. p(w i |t i ) ? ?? lexical prob. (1)hmm taggers are fast and were successfully applied to a wide range of languages and training cor pora. c ? 2008. 
licensed under the creative commonsattribution-noncommercial-share alike 3.0 unported li cense (http://creativecommons.org/licenses/by-nc-sa/3.0/). some rights reserved. pos taggers are usually trained on corpora with between 50 and 150 different pos tags. tagsets of this size contain little or no information aboutnumber, gender, case and similar morphosyntactic features. for languages with a rich morphol ogy such as german or czech, more fine-grained tagsets are often considered more appropriate. theadditional information may also help to disam biguate the (base) part of speech. without gender information, for instance, it is difficult for a tagger to correctly disambiguate the german sentence ist das realit?at? (is that reality?). the word das is ambiguous between an article and a demonstrative. because of the lack of gender agreement between das (neuter) and the noun realit?at (feminine), the article reading must be wrong. the german tiger treebank (brants et al, 2002) is an example of a corpus with a more fine-grainedtagset (over 700 tags overall). large tagsets aggra vate sparse data problems. as an example, take the german sentence das zu versteuernde einkommen sinkt (?the to be taxed income decreases?; the taxable income decreases). this sentence should be tagged as shown in table 1. das art.def.nom.sg.neut zu part.zu versteuernde adja.pos.nom.sg.neut einkommen n.reg.nom.sg.neut sinkt vfin.full.3.sg.pres.ind . sym.pun.sent. table 1: correct pos tags for the german sentence das zu versteuernde einkommen sinkt. unfortunately, the pos trigram consisting of the tags of the first three words does not occurin the tiger corpus. (neither does the pair con sisting of the first two tags.) the unsmoothed 777context probability of the third pos tag is there fore 0. if the probability is smoothed with the backoff distribution p(?|part.zu), the most probable tag is adja.pos.acc.sg.fem rather thanadja.pos.nom.sg.neut. 
thus, the agreement be tween the article and the adjective is not checked anymore. a closer inspection of the tiger corpus reveals that it actually contains all the information needed to completely disambiguate each component of the pos tag adja.pos.nom.sg.neut: ? all words appearing after an article (art)and the infinitive particle zu (part.zu) are at tributive adjectives (adja) (10 of 10 cases). all adjectives appearing after an article and a particle (part) have the degree positive (pos) (39 of 39 cases). all adjectives appearing after a nominative article and a particle have nominative case (11 of 11 cases).? all adjectives appearing after a singular arti cle and a particle are singular (32 of 32 cases). all adjectives appearing after a neuter article and a particle are neuter (4 of 4 cases). by (1) decomposing the context probability of adja.pos.nom.sg.neut into a product of attribute probabilities p(adja | 2:art, 2:art.def, 2:art.nom, 2:art.sg, 2:art.neut, 1:part, 1:part.zu) ? p(pos| 2:art, 2:art.def, 2:art.nom, 2:art.sg, 2:art.neut, 1:part, 1:part.zu, 0:adja) ? p(nom | 2:art, 2:art.def, 2:art.nom, 2:art.sg, 2:art.neut, 1:part, 1:part.zu, 0:adja, 0:adja.pos) ? p(sg | 2:art, 2:art.def, 2:art.nom, 2:art.sg, 2:art.neut, 1:part, 1:part.zu, 0:adja, 0:adja.pos, 0:adja.nom) ? p(neut | 2:art, 2:art.def, 2:art.nom, 2:art.sg, 2:art.neut, 1:part, 1:part.zu, 0:adja, 0:adja.pos, 0:adja.nom, 0:adja.sg) and (2) selecting the relevant context attributes for the prediction of each attribute, we obtain the following expression for the context probability: p(adja | art, part.zu) ? p(pos | 2:art, 1:part, 0:adja) ? p(nom | 2:art.nom, 1:part.zu, 0:adja) ? p(sg | 2:art.sg, 1:part.zu, 0:adja) ? p(neut | 2:art.neut, 1:part.zu, 0:adja) the conditional probability of each attribute is 1. hence the context probability of the whole tag is. also 1. without having observed the given context, it is possible to deduce that the observed pos tag is the only possible tag in this context. 
these considerations motivate an hmm tagging approach which decomposes the pos tags into a set of simple attributes, and uses decision trees toestimate the probability of each attribute. decision trees are ideal for this task because the iden tification of relevant attribute combinations is at the heart of this method. the backoff smoothing methods of traditional n-gram pos taggers require an ordering of the reduced contexts which is not available, here. discriminatively trained taggers, on the other hand, have difficulties to handle the huge number of features which are active at the same time if any possible combination of context attributes defines a separate feature.we presented a hmm pos tagger for fine-grained tagsets which splits the pos tags into attributevectors and estimates the conditional probabilities of the attributes with decision trees. discriminatively trained taggers, on the other hand, have difficulties to handle the huge number of features which are active at the same time if any possible combination of context attributes defines a separate feature. in ex periments with german and czech corpora, this method achieved a higher tagging accuracy than two state-of-the-art general-purpose pos taggers (tnt and svmtool). a hidden-markov-model part-of-speech tagger (brants, 2000, e.g.) computes the most probable pos tag sequence ? t n 1 = ? t 1 , ..., ? t n for a given word sequence w n 1 . ? t n 1 = argmax t n 1 p(t n 1 , w n 1 )the joint probability of the two sequences is de fined as the product of context probabilities and lexical probabilities over all pos tags: p(t n 1 , w n 1 ) = n ? i=1 p(t i |t i?1 i?k ) ? ?? context prob. the backoff smoothing methods of traditional n-gram pos taggers require an ordering of the reduced contexts which is not available, here. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C08-1107.txt b/DATASET_PACSUM/dataset/inputs/C08-1107.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e47dd2d5e83a5616830ec7aa41869c6ebe08b8b --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C08-1107.txt @@ -0,0 +1 @@ +in many nlp applications, such as question an swering (qa) and information extraction (ie), it is crucial to recognize whether a specific target meaning is inferred from a text. for example, a qa system has to deduce that ?sco sued ibm? is inferred from ?sco won a lawsuit against ibm? to answer ?whom did sco sue??. this type of reasoning has been identified as a core semanticinference paradigm by the generic textual entail ment framework (giampiccolo et al, 2007). an important type of knowledge needed for such inference is entailment rules. an entailmentrule specifies a directional inference relation be tween two templates, text patterns with variables, such as ?x win lawsuit against y ? x sue y ?. applying this rule by matching ?x win lawsuit against y ? in the above text allows a qa system to c ? 2008. licensed under the creative commonsattribution-noncommercial-share alike 3.0 unported li cense (http://creativecommons.org/licenses/by-nc-sa/3.0/). some rights reserved.infer ?x sue y ? and identify ?ibm?, y ?s instantiation, as the answer for the above question. entail ment rules capture linguistic and world-knowledge inferences and are used as an important building block within different applications, e.g. (romano et al, 2006). one reason for the limited performance of generic semantic inference systems is the lack of broad-scale knowledge-bases of entailment rules (in analog to lexical resources such as wordnet). supervised learning of broad coverage rule-sets is an arduous task. this sparked intensive research on unsupervised acquisition of entailment rules (and similarly paraphrases) e.g. (lin and pantel, 2001; szpektor et al, 2004; sekine, 2005). 
most unsupervised entailment rule acquisitionmethods learn binary rules, rules between tem plates with two variables, ignoring unary rules, rules between unary templates (templates withonly one variable). however, a predicate quite of ten appears in the text with just a single variable(e.g. intransitive verbs or passives), where infer ence requires unary rules, e.g. ?x take a nap?x sleep? (further motivations in section 3.1).in this paper we focus on unsupervised learning of unary entailment rules. two learning ap proaches are proposed. in our main approach, rules are learned by measuring how similar the variable instantiations of two templates in a corpusare. in addition to adapting state-of-the-art similar ity measures for unary rule learning, we propose a new measure, termed balanced-inclusion, which balances the notion of directionality in entailment with the common notion of symmetric semantic similarity. in a second approach, unary rules arederived from binary rules learned by state-of-the art binary rule learning methods. we tested the various unsupervised unary rule 849learning methods, as well as a binary rule learn ing method, on a test set derived from a standard ie benchmark. this provides the first comparisonbetween the performance of unary and binary rule sets. several results rise from our evaluation: (a) while most work on unsupervised learning ignored unary rules, all tested unary methods outperformed the binary method; (b) it is better to learn unary rules directly than to derive them from a binary rule-base; (c) our proposed balanced-inclusion measure outperformed all other tested methods interms of f1 measure. moreover, only balancedinclusion improved f1 score over a baseline infer ence that does not use entailment rules at all .we presented two approaches for unsupervised ac quisition of unary entailment rules from regular (non-comparable) corpora. 
in many nlp applications, such as question an swering (qa) and information extraction (ie), it is crucial to recognize whether a specific target meaning is inferred from a text. moreover, only balancedinclusion improved f1 score over a baseline infer ence that does not use entailment rules at all . for example, a qa system has to deduce that ?sco sued ibm? acknowledgements this work was partially supported by isf grant 1095/05, the ist programme of the europeancommunity under the pascal network of ex cellence ist-2002-506778 and the negev project (www.negev-initiative.org). several results rise from our evaluation: (a) while most work on unsupervised learning ignored unary rules, all tested unary methods outperformed the binary method; (b) it is better to learn unary rules directly than to derive them from a binary rule-base; (c) our proposed balanced-inclusion measure outperformed all other tested methods interms of f1 measure. this provides the first comparisonbetween the performance of unary and binary rule sets. we implemented the unary rule learning algo rithms described in section 3 and the binary dirt algorithm (lin and pantel, 2001). is inferred from ?sco won a lawsuit against ibm? \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C08-1109.txt b/DATASET_PACSUM/dataset/inputs/C08-1109.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c214faa357902221e5084e913a7590130850825 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C08-1109.txt @@ -0,0 +1 @@ +the long-term goal of our work is to develop asystem which detects errors in grammar and us age so that appropriate feedback can be given to non-native english writers, a large and growing segment of the world?s population. estimates arethat in china alone as many as 300 million people are currently studying english as a second lan guage (esl). usage errors involving prepositions are among the most common types seen in thewriting of non-native english speakers. 
for ex ample, (izumi et al, 2003) reported error rates for english prepositions that were as high as 10% ina japanese learner corpus. errors can involve incorrect selection (?we arrived to the station?), ex traneous use (?he went to outside?), and omission (?we are fond null beer?). what is responsiblefor making preposition usage so difficult for non native speakers? c ? 2008. licensed under the creative commonsattribution-noncommercial-share alike 3.0 unported li cense (http://creativecommons.org/licenses/by-nc-sa/3.0/). some rights reserved. at least part of the difficulty seems to be due tothe great variety of linguistic functions that prepositions serve. when a preposition marks the argument of a predicate, such as a verb, an adjective, or a noun, preposition selection is con strained by the argument role that it marks, thenoun which fills that role, and the particular predi cate. many english verbs also display alternations (levin, 1993) in which an argument is sometimes marked by a preposition and sometimes not (e.g., ?they loaded the wagon with hay? / ?they loaded hay on the wagon?). when prepositions introduceadjuncts, such as those of time or manner, selec tion is constrained by the object of the preposition (?at length?, ?in time?, ?with haste?). finally, the selection of a preposition for a given context also depends upon the intended meaning of the writer (?we sat at the beach?, ?on the beach?, ?near the beach?, ?by the beach?). with so many sources of variation in englishpreposition usage, we wondered if the task of se lecting a preposition for a given context might prove challenging even for native speakers. to investigate this possibility, we randomly selected200 sentences from microsoft?s encarta encyclopedia, and, in each sentence, we replaced a ran domly selected preposition with a blank line. 
we then asked two native english speakers to perform a cloze task by filling in the blank with the best preposition, given the context provided by the rest of the sentence. our results showed only about75% agreement between the two raters, and be tween each of our raters and encarta.the presence of so much variability in prepo sition function and usage makes the task of thelearner a daunting one. it also poses special chal lenges for developing and evaluating an nlp error detection system. this paper addresses both the 865 development and evaluation of such a system. first, we describe a machine learning system that detects preposition errors in essays of esl writers. to date there have been relatively few attempts to address preposition error detection,though the sister task of detecting determiner errors has been the focus of more research. our system performs comparably with other leading sys tems. we extend our previous work (chodorow etal., 2007) by experimenting with combination fea tures, as well as features derived from the google n-gram corpus and comlex (grishman et al, 1994).second, we discuss drawbacks in current meth ods of annotating esl data and evaluating errordetection systems, which are not limited to prepo sition errors. while the need for annotation by multiple raters has been well established in nlp tasks (carletta, 1996), most previous work in error detection has surprisingly relied on only one raterto either create an annotated corpus of learner errors, or to check the system?s output. some grammatical errors, such as number disagreement be tween subject and verb, no doubt show very highreliability, but others, such as usage errors involv ing prepositions or determiners are likely to be much less reliable. our results show that relyingon one rater for system evaluation can be problem atic, and we provide a sampling approach which can facilitate using multiple raters for this task. 
in the next section, we describe a system that automatically detects errors involving incorrect preposition selection (?we arrived to the station?) and extraneous preposition usage (?he went to outside?). in sections 3 and 4, we discuss theproblem of relying on only one rater for exhaus tive annotation and show how multiple raters can be used more efficiently with a sampling approach.finally, in section 5 we present an analysis of com mon preposition errors that non-native speakers make.we wouldalso like to acknowledge the three anonymous reviewers and derrick higgins for their helpful com ments and feedback. the long-term goal of our work is to develop asystem which detects errors in grammar and us age so that appropriate feedback can be given to non-native english writers, a large and growing segment of the world?s population. this paper has two contributions to the field of error detection in non-native writing. estimates arethat in china alone as many as 300 million people are currently studying english as a second lan guage (esl). and extraneous preposition usage (?he went to outside?). in sections 3 and 4, we discuss theproblem of relying on only one rater for exhaus tive annotation and show how multiple raters can be used more efficiently with a sampling approach.finally, in section 5 we present an analysis of com mon preposition errors that non-native speakers make. in the next section, we describe a system that automatically detects errors involving incorrect preposition selection (?we arrived to the station?) usage errors involving prepositions are among the most common types seen in thewriting of non-native english speakers. one aspect of automatic error detection that usu ally is under-reported is an analysis of the errors that learners typically make. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C08-1114.txt b/DATASET_PACSUM/dataset/inputs/C08-1114.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f493ce20a68f10db25d6b5e6f32e5078194dbee --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C08-1114.txt @@ -0,0 +1 @@ +a pair of words (petrify:stone) is analogous to another pair (vaporize:gas) when the semantic re lations between the words in the first pair are highly similar to the relations in the second pair. two words (levied and imposed) are synonymousin a context (levied a tax) when they can be interchanged (imposed a tax), they are are antony mous when they have opposite meanings (black c ? 2008, national research council of canada (nrc).licensed to the coling 2008 organizing committee for pub lication in coling 2008 and for re-publishing in any form or medium. and white), and they are associated when they tend to co-occur (doctor and hospital).on the surface, it appears that these are four distinct semantic classes, requiring distinct nlp al gorithms, but we propose a uniform approach to all four. we subsume synonyms, antonyms, and associations under analogies. in essence, we say that x and y are antonyms when the pair x:y is analogous to the pair black:white, x and y are synonyms when they are analogous to the pair levied:imposed, and x and y are associated when they are analogous to the pair doctor:hospital. there is past work on recognizing analogies(reitman, 1965), synonyms (landauer and dumais, 1997), antonyms (lin et al, 2003), and asso ciations (lesk, 1969), but each of these four tasks has been examined separately, in isolation from the others. as far as we know, the algorithm proposed here is the first attempt to deal with all four tasks using a uniform approach. 
we believe that it isimportant to seek nlp algorithms that can han dle a broad range of semantic phenomena, becausedeveloping a specialized algorithm for each phe nomenon is a very inefficient research strategy.it might seem that a lexicon, such as word net (fellbaum, 1998), contains all the information we need to handle these four tasks. however, weprefer to take a corpus-based approach to seman tics. veale (2004) used wordnet to answer 374 multiple-choice sat analogy questions, achievingan accuracy of 43%, but the best corpus-based ap proach attains an accuracy of 56% (turney, 2006). another reason to prefer a corpus-based approachto a lexicon-based approach is that the former re quires less human labour, and thus it is easier to extend to other languages.in section 2, we describe our algorithm for rec ognizing analogies. we use a standard supervised 905 machine learning approach, with feature vectorsbased on the frequencies of patterns in a large cor pus. we use a support vector machine (svm) to learn how to classify the feature vectors (platt, 1998; witten and frank, 1999). section 3 presents four sets of experiments. we apply our algorithm for recognizing analogies to multiple-choice analogy questions from the sat college entrance test, multiple-choice synonym questions from the toefl (test of english as aforeign language), esl (english as a second language) practice questions for distinguishing syn onyms and antonyms, and a set of word pairs thatare labeled similar, associated, and both, devel oped for experiments in cognitive psychology.we discuss the results of the experiments in section 4. the accuracy of the algorithm is competitive with other systems, but the strength of the al gorithm is that it is able to handle all four tasks, with no tuning of the learning parameters to the particular task. 
it performs well, although it iscompeting against specialized algorithms, devel oped for single tasks.related work is examined in section 5 and limitations and future work are considered in sec tion 6. we conclude in section 7.in this paper, we have described a uniform approach to analogies, synonyms, antonyms, and as sociations, in which all of these phenomena are subsumed by analogies. a pair of words (petrify:stone) is analogous to another pair (vaporize:gas) when the semantic re lations between the words in the first pair are highly similar to the relations in the second pair. acknowledgementsthanks to joel martin and the anonymous review ers of coling 2008 for their helpful comments. we conclude in section 7. the main limitation of pairclass is the need for a large corpus. other potential applications in clude any task that involves semantic relations, such as word sense disambiguation, informationretrieval, information extraction, and metaphor in terpretation. this paper is a small step towards that goal. we view the problem ofrecognizing analogies as the classification of se mantic relations between words. it performs well, although it iscompeting against specialized algorithms, devel oped for single tasks.related work is examined in section 5 and limitations and future work are considered in sec tion 6. some work is required to fit each probleminto the general framework of pairclass (supervised classification of word pairs) but the core al gorithm is the same in each case. two words (levied and imposed) are synonymousin a context (levied a tax) when they can be interchanged (imposed a tax), they are are antony mous when they have opposite meanings (black c ? 2008, national research council of canada (nrc).licensed to the coling 2008 organizing committee for pub lication in coling 2008 and for re-publishing in any form or medium. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C10-1011.txt b/DATASET_PACSUM/dataset/inputs/C10-1011.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d0d28eca19766ea4ff4abcf2e50f09c88ccfb82 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C10-1011.txt @@ -0,0 +1 @@ +highly accurate dependency parsers have high de mands on resources and long parsing times. the training of a parser frequently takes several days and the parsing of a sentence can take on averageup to a minute. the parsing time usage is impor tant for many applications. for instance, dialog systems only have a few hundred milliseconds toanalyze a sentence and machine translation sys tems, have to consider in that time some thousandtranslation alternatives for the translation of a sen tence. parsing and training times can be improved by methods that maintain the accuracy level, or methods that trade accuracy against better parsing times. software developers and researchers areusually unwilling to reduce the quality of their ap plications. consequently, we have to consider atfirst methods to improve a parser, which do not in volve an accuracy loss, such as faster algorithms,faster implementation of algorithms, parallel al gorithms that use several cpu cores, and feature selection that eliminates the features that do not improve accuracy. we employ, as a basis for our parser, the secondorder maximum spanning tree dependency pars ing algorithm of carreras (2007). this algorithmfrequently reaches very good, or even the best la beled attachment scores, and was one of the most used parsing algorithms in the shared task 2009 of the conference on natural language learning (conll) (hajic? et al, 2009). we combined thisparsing algorithm with the passive-aggressive perceptron algorithm (crammer et al, 2003; mcdon ald et al, 2005; crammer et al, 2006). 
a parser build out of these two algorithms provides a good baseline and starting point to improve upon the parsing and training times. the rest of the paper is structured as follows. in section 2, we describe related work. in section 3, we analyze the time usage of the components of 89the parser. in section 4, we introduce a new kernel that resolves some of the bottlenecks and im proves the performance. in section 5, we describethe parallel parsing algorithms which nearly allowed us to divide the parsing times by the number of cores. in section 6, we determine the opti mal setting for the non-projective approximationalgorithm. in section 7, we conclude with a sum mary and an outline of further research.4 4we provide the parser and hash kernel as open source for download from http://code.google.com/p/mate-tools. highly accurate dependency parsers have high de mands on resources and long parsing times. we have developed a very fast parser with ex cellent attachment scores. in section 7, we conclude with a sum mary and an outline of further research. for the languages of the 2009 conll shared task, the parser could reach higher accuracy scores on average than the top performing systems. we are convinced thatthe hash kernel can be applied successful to tran sition based dependency parsers, phrase structure parsers and many other nlp applications. in section 6, we determine the opti mal setting for the non-projective approximationalgorithm. the training of a parser frequently takes several days and the parsing of a sentence can take on averageup to a minute. johansson and nugues (2008) reported training times of 2.4 days for english with the high-order parsing algorithm of carreras (2007). thresholdfor non-projective parsing, we use the nonprojective approximation algorithm of mcdon ald and pereira (2006). the scores for catalan, chinese and japanese are still lower than the top scores. 
in a pilot experiment, we have shown that it is possible to reduce the parsing time in this way to as little as 9 milliseconds. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C10-1152.txt b/DATASET_PACSUM/dataset/inputs/C10-1152.txt new file mode 100644 index 0000000000000000000000000000000000000000..80866a229b62fe57fe33a4fd98a20b3fd8cab26c --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C10-1152.txt @@ -0,0 +1 @@ +sentence simplification transforms long and dif ficult sentences into shorter and more readable ones. this helps humans read texts more easilyand faster. reading assistance is thus an important application of sentence simplification, espe cially for people with reading disabilities (carrollet al, 1999; inui et al, 2003), low-literacy read ers (watanabe et al, 2009), or non-native speakers (siddharthan, 2002).not only human readers but also nlp applications can benefit from sentence simplification. the original motivation for sentence simplification is using it as a preprocessor to facili tate parsing or translation tasks (chandrasekar et al., 1996). complex sentences are considered as stumbling blocks for such systems. more recently,sentence simplification has also been shown help ful for summarization (knight and marcu, 2000), ? this work has been supported by the emmy noether program of the german research foundation (dfg) underthe grant no. gu 798/3-1, and by the volkswagen founda tion as part of the lichtenberg-professorship program under the grant no. i/82806.sentence fusion (filippova and strube, 2008b), se mantic role labeling (vickrey and koller, 2008), question generation (heilman and smith, 2009), paraphrase generation (zhao et al, 2009) and biomedical information extraction (jonnalagadda and gonzalez, 2009).at sentence level, reading difficulty stems either from lexical or syntactic complexity. 
sen tence simplification can therefore be classifiedinto two types: lexical simplification and syntac tic simplification (carroll et al, 1999). these two types of simplification can be further implemented by a set of simplification operations. splitting, dropping, reordering, and substitution are widely accepted as important simplification operations. the splitting operation splits a long sentence intoseveral shorter sentences to decrease the complex ity of the long sentence. the dropping operation further removes unimportant parts of a sentence to make it more concise. the reordering operationinterchanges the order of the split sentences (sid dharthan, 2006) or parts in a sentence (watanabeet al, 2009). finally, the substitution operation re places difficult phrases or words with their simpler synonyms.in most cases, different simplification operations happen simultaneously. it is therefore nec essary to consider the simplification process as a combination of different operations and treatthem as a whole. however, most of the existing models only consider one of these operations. siddharthan (2006) and petersen and ostendorf (2007) focus on sentence splitting, while sen tence compression systems (filippova and strube, 2008a) mainly use the dropping operation. as faras lexical simplification is concerned, word substitution is usually done by selecting simpler syn onyms from wordnet based on word frequency (carroll et al, 1999).in this paper, we propose a sentence simplifica tion model by tree transformation which is based 1353 on techniques from statistical machine translation (smt) (yamada and knight, 2001; yamada andknight, 2002; graehl et al, 2008). our model in tegrally covers splitting, dropping, reordering and phrase/word substitution. the parameters of ourmodel can be efficiently learned from complex simple parallel datasets. the transformation froma complex sentence to a simple sentence is con ducted by applying a sequence of simplification operations. 
an expectation maximization (em) algorithm is used to iteratively train our model. we also propose a method based on monolingualword mapping which speeds up the training pro cess significantly. finally, a decoder is designed to generate the simplified sentences using a greedy strategy and integrates language models.in order to train our model, we further com pile a large-scale complex-simple parallel dataset(pwkp) from simple english wikipedia1 and en glish wikipedia2, as such datasets are rare.we organize the remainder of the paper as follows: section 2 describes the pwkp dataset. sec tion 3 presents our tsm model. sections 4 and 5 are devoted to training and decoding, respectively. section 6 details the evaluation. the conclusions follow in the final section.in this paper, we presented a novel large-scale par allel dataset pwkp for sentence simplification. the evaluation shows that tsm can achieve better overall readability scores than a set of baseline systems. the conclusions follow in the final section. sentence simplification transforms long and dif ficult sentences into shorter and more readable ones. this helps humans read texts more easilyand faster. section 6 details the evaluation. in the future, we will investigate more sophisticated features and rules to enhance tsm. our evaluation dataset consists of 100 complex sentences and 131 parallel simple sentences from pwkp. we collected a paired dataset from the english wikipedia and simple english wikipedia. as the dependency. the first is moses which is a state of the art smt system widely used as a baseline in mt community. sections 4 and 5 are devoted to training and decoding, respectively. we first per form 1 to 1 mapping with sentence-level tf*idf and then combine the pairs with the same complex sentence and adjacent simple sentences. but the parser returns ?su perset? obviously, the purpose of mosesis cross-lingual translation rather than monolin 1358 gual simplification. 
should be a dependency of ?called?. they have not been used for training.four baseline systems are compared in our eval uation. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C10-2005.txt b/DATASET_PACSUM/dataset/inputs/C10-2005.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0a6966e24771095f62485c7339a9edc293f0c4a --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C10-2005.txt @@ -0,0 +1 @@ +twitter is one of the most popular social network websites and has been growing at a very fast pace. the number of twitter users reached an estimated75 million by the end of 2009, up from approx imately 5 million in the previous year. through the twitter platform, users share either informationor opinions about personalities, politicians, prod ucts, companies, events (prentice and huffman, 2008) etc. this has been attracting the attention of different communities interested in analyzing its content. sentiment detection of tweets is one of the basicanalysis utility functions needed by various applications over twitter data. many systems and ap proaches have been implemented to automatically detect sentiment on texts (e.g., news articles, web reviews and web blogs) (pang et al, 2002; pang and lee, 2004; wiebe and riloff, 2005; glance et al, 2005; wilson et al, 2005). most of theseapproaches use the raw word representation (n grams) as features to build a model for sentiment detection and perform this task over large pieces of texts. however, the main limitation of usingthese techniques for the twitter context is mes sages posted on twitter, so-called tweets, are veryshort. the maximum size of a tweet is 140 char acters. in this paper, we propose a 2-step sentiment analysis classification method for twitter, whichfirst classifies messages as subjective and ob jective, and further distinguishes the subjectivetweets as positive or negative. 
to reduce the la beling effort in creating these classifiers, instead of using manually annotated data to compose thetraining data, as regular supervised learning ap proaches, we leverage sources of noisy labels asour training data. these noisy labels were pro vided by a few sentiment detection websites over twitter data. to better utilize these sources, we verify the potential value of using and combining them, providing an analysis of the provided labels, examine different strategies of combining these sources in order to obtain the best outcome; and, propose a more robust feature set that captures a more abstract representation of tweets, composedby meta-information associated to words and spe cific characteristics of how tweets are written. by using it, we aim to handle better: the problem of lack of information on tweets, helping on thegeneralization process of the classification algo rithms; and the noisy and biased labels provided by those websites.the remainder of this paper is organized as fol lows. in section 2, we provide some context about messages on twitter and about the websites used as label sources. we introduce the features used in the sentiment detection and also provide a deep analysis of the labels generated by those sources in section 3. we examine different strategies of 36 combining these sources and present an extensive experimental evaluation in section 4. finally, we discuss previous works related to ours in section 5and conclude in section 6, where we outline direc tions and future work.twitter is one of the most popular social network websites and has been growing at a very fast pace. we have presented an effective and robust sen timent detection approach for twitter messages, which uses biased and noisy labels as input to build its models. as future work, we want to perform a more fine grained analysis of sentences in order to identifyits main focus and then based the sentiment clas sification on it. 
finally, we discuss previous works related to ours in section 5and conclude in section 6, where we outline direc tions and future work. we examine different strategies of 36 combining these sources and present an extensive experimental evaluation in section 4. the number of twitter users reached an estimated75 million by the end of 2009, up from approx imately 5 million in the previous year. in this section, we give some context about twitter messages and the sources used for our data-driven approach. http://bit.ly/9k4n9p #obama figure 1: example of a tweet. we showed in section 4 that our approach works better than theirs for this problem, obtaining lower error rates. rt @twuser: obama is the first u.s. president not to have seen a new state added in his lifetime. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C10-2028.txt b/DATASET_PACSUM/dataset/inputs/C10-2028.txt new file mode 100644 index 0000000000000000000000000000000000000000..a282ac832163b1edcf57ab9a2a1bee7f39350492 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C10-2028.txt @@ -0,0 +1 @@ +a huge amount of social media including news,forums, product reviews and blogs contain nu merous sentiment-based sentences. sentiment is defined as ?a personal belief or judgment that ?* both authors equally contributed to this paper.is not founded on proof or certainty?1. senti ment expressions may describe the mood of thewriter (happy/sad/bored/grateful/...) or the opin ion of the writer towards some specific entity (x is great/i hate x, etc.). automated identification of diverse sentimenttypes can be beneficial for many nlp systems such as review summarization systems, dia logue systems and public media analysis systems. sometimes it is directly requested by the user toobtain articles or sentences with a certain senti ment value (e.g give me all positive reviews of product x/ show me articles which explain why movie x is boring). 
in some other cases obtaining sentiment value can greatly enhance information extraction tasks like review summarization. whilethe majority of existing sentiment extraction sys tems focus on polarity identification (e.g., positive vs. negative reviews) or extraction of a handful of pre-specified mood labels, there are many useful and relatively unexplored sentiment types. sentiment extraction systems usually require an extensive set of manually supplied sentiment words or a handcrafted sentiment-specific dataset. with the recent popularity of article tagging, some social media types like blogs allow users to add sentiment tags to articles. this allows to use blogsas a large user-labeled dataset for sentiment learning and identification. however, the set of sentiment tags in most blog platforms is somewhat re stricted. moreover, the assigned tag applies to the whole blog post while a finer grained sentiment extraction is needed (mcdonald et al, 2007).with the recent popularity of the twitter micro blogging service, a huge amount of frequently 1wordnet 2.1 definitions. 241self-standing short textual sentences (tweets) became openly available for the research community. many of these tweets contain a wide vari ety of user-defined hashtags. some of these tagsare sentiment tags which assign one or more senti ment values to a tweet. in this paper we propose away to utilize such tagged twitter data for classi fication of a wide variety of sentiment types from text. we utilize 50 twitter tags and 15 smileys assentiment labels which allow us to build a classifier for dozens of sentiment types for short tex tual sentences. in our study we use four different feature types (punctuation, words, n-grams and patterns) for sentiment classification and evaluate the contribution of each feature type for this task.we show that our framework successfully identi fies sentiment types of the untagged tweets. we confirm the quality of our algorithm using human judges. 
we also explore the dependencies and overlap between different sentiment types represented by smileys and twitter tags. section 2 describes related work. section 3 details classification features and the algorithm, while section 4 describes the dataset and labels. automated and manual evaluation protocols and results are presented in section 5, followed by a short discussion.automated and manual evaluation protocols and results are presented in section 5, followed by a short discussion. a huge amount of social media including news,forums, product reviews and blogs contain nu merous sentiment-based sentences. section 3 details classification features and the algorithm, while section 4 describes the dataset and labels. while hashtag labels arespecific to twitter data, the obtained feature vectors are not heavily twitter-specific and in the fu ture we would like to explore the applicability oftwitter data for sentiment multi-class identifica tion and classification in other domains. sentiment is defined as ?a personal belief or judgment that ?* both authors equally contributed to this paper.is not founded on proof or certainty?1. we presented a framework which allows an au tomatic identification and classification of various sentiment types in short text fragments which isbased on twitter data. senti ment expressions may describe the mood of thewriter (happy/sad/bored/grateful/...) the purpose of our evaluation was to learn how well our framework can identify and distinguishbetween sentiment types defined by tags or smileys and to test if our framework can be successfully used to identify sentiment types in new un tagged sentences. section 2 describes related work. 5.1 evaluation using cross-validation. sentiment analysis tasks typically combine twodifferent tasks: (1) identifying sentiment expres sions, and (2) determining the polarity (sometimes called valence) of the expressed sentiment. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C86-1016.txt b/DATASET_PACSUM/dataset/inputs/C86-1016.txt new file mode 100644 index 0000000000000000000000000000000000000000..c78444f30cf368b65b7f070d7f22ab2d729e6a79 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C86-1016.txt @@ -0,0 +1 @@ +at the other end of the range covered by d-patr are unification-based categorial grammars (klein, steedman, uszkoreit, wittenburg) in which all the syntactic information is incorporated in the lexicon and the remaining few combinatorial rules that build phrases are function application and composition. definite-clause grammars (pereira and warren) can also be encoded in the patr formalism. what these approaches have in common is that syntactic rules and lexieal entries can be written down as sets of attribute-value pairs. moreover, because a value at the end of one path of attributes can be shared by another path, the structures that are generated by such grammars can be thought of as directed graphs cdags"). unification is the key operation for building these structures. because unification is associative and commutative, statements in a unification-based grammar formalism are order-independent and bidirectional with respect to parsing and generation. for a comprehensive introduction tounification-based approaches togrammar, see shieber 1986 (forthcoming). the idea that led to the present version of d-patr was to produce a simple compact system for experimenting with unification-based grammars that would run on machines maller than the symbolics 3600 for which the original tati~ implementation at sri had been created. the first version of i)-patr, initially called }lug, was written at the scandinavian summer workshop for computational linguistics in helsinki, finland, at the end of august 1985. although the actual notation for writing rules in d-patr in some respects differs from the notation in the original pati? 
system, essentially both systems implement the samegrammar formalism. to emphasize this point, the two implementations are now called z-patr (zeta-lisp patr) and d patr (interlisp-d patr). a number of innovations that came in with l) patr (hug) have since migrated to z-patr. a case in point is the method for minimizing copying in unification that is discussed in the section on parsing and unification. other implementation differences remain--for example, in the parsing algorithm and in the treatment of gaps--but grammars written for d-patr are convertible into z-patr format, and vice versa.d-patr: a deve lopment env i ronment fo r un i f i ca t ion -based grammars lauri karttunen artificial intelligence center sri international 333 ravenswood avenue menlo park, ca 94025 usa and center for the study of language and information stanford university 1 introduction i)-patr is a development environment for unification-based grammars on xerox l i00 series work stations. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C86-1045.txt b/DATASET_PACSUM/dataset/inputs/C86-1045.txt new file mode 100644 index 0000000000000000000000000000000000000000..631bc618c73c3a619ec2150bd3d8ecd5b8b924ac --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C86-1045.txt @@ -0,0 +1 @@ +the work on merging strategies from unification grammars and categorial grammars has its origins in several research efforst that have been pursued in parallel. one of them is the grammar development on the patr system (shieber et al, 1983; shieber, 1984) at sri. for quite a while now i have been using the excellent facilities of patr for the design and testing of experimental\[ cugs. such grammars currently run on two patr implementations: stuart shieber's zetalisp version on the symbolics 3600 and lauri karttunen's interlisp-d w:rsion on the xerox 1109. the work on cugs has influenced our efforts to develop a larger patr grammar, and will do so even more in the future. 
on the theoretical side, this work is part of ongoing research on such topics as word order variation, modification, and german syntax within projects at sri and csli (stanford university). the structure of the paper eflects the diverse nature of the enterprise. in the first section, i will introduce the basic notions of cugs and demonstrate them through examples in patr notation. the second section discusses the motivation for this work and some of its theoretical implications. the third section sketches a linguistically motivated cug framework with a strong lexical syntax that accomodates word order variation. the paper concludes with a brief discussion of possible cug approaches tolong-distance d pendencies. 1. basic notions of categorial unification. grammars 1.2. unif ication grammars and categorial. grammars both terms, unification grammar (ug) and categorial grammar (cg), stand for whole families of related grammar formalisms whose basic notions are widely known.l yet, for the characterization f the class of formalisms i want to discuss, it will be useful to review the most central concepts of both ug and cg. unification grammar formalisms employ complex feature structures as their syntactic representations. these structures encode partial information about constituents. either term or graph unification is utilized as the main operation for checking, propagating, and merging of the information in these complex representations. most unification grammars also use the complex feature structures for the linking of syntactic and semantic information. in traditional categorial grammars, all information about possible syntactic ombinations of constituents is encoded in their categories. those grammars allow only binary combinations. one of the two combined constituents, the functor, encodes the combination funtion, the other constituent serves as the argument to this function. 
instead ot7 phrase structure rules, the grammar contains one or, in some formalisms, two combination rules that combine a functor and an argument by applying the function encoded in the functor to the argument constituent. most categorial grammars only combine constituents whose terminal strings concatenate in the input string, but this need not be so. in most categorial grammar formalisms, it is assumed that the syntactic functor-argument structure in the corresponding compositional semantics. 187 there are usually two types of grammatical categories in a categorial grammar, basic and derived ones. basic categories are just category symbols, derived categories are functions from one (derived or basic) category to another. a derived category that encodes a function from category a to category b might be written b/a if the functor combines with an argument to its right or b~, if it expects the argument to its left. thus, if we assume just two basic categories, n and s, then n/s, s/n, n\s, s\n, (s\n)/n, (n/s\(s\(n/n)), etc. are also categories. not all of these categories will ever occur in the derivation of sentences. the set of actually occurring categories depends on the lexical categories of the language. assume the following simple sample grammar: (2) basic categories: n, s lexical categories: n (paul, peter) (s\n)fn (likes) the grammar is used for the sample derivation in (3): (3) peter likes paul n (s\n)fin n skn s it should be clear from my brief description that the defining characteristics of unification grammar have nothing to do with the ones of categorial grammar. we will see that the properties of both grammar types actually complement each other quite wetl. 1.2. a sample cug in patr notat ion since the first categorial unification grammars were written in the patr formalism and tested on the patr systems implemented at sri, and since patr is especially well suited for the emulation of other grammar formalisms, i will use its notation. 
the representations in patr are directed acyclic graphs (dags) 2 . rules have two parts, a head and a body. the head is a context-free rewrite rule and the body is a dag. here is an example, a simple rule that forms a sentence by combining anoun phrase with a verb phrase. 188 (4) head xo -~ x1, x2 body in unification otation = s = np = vp = body in graph notation xo r s np the rule states that two constituents x1 and x2 can combine to form a constituent x0 if the terminal string covered by x1 immediately precedes the terminal string of x2 and if the dags of x0, x1, and x2 unify with the x0, x1, and x2 subgraphs of the rule body, respectively. i will now show the most straight-forward encoding of a categorial grammar in this notation. there are two types of constituent graphs. constituent graphs for basic categories are of the following form: (5) n s of course, there might be more features associated with the constituent: (6) /oe 7 n s finite 3 sg derived constituents have graphs of the following form: (7) arg (t0b) backward functional application (bfa) value -~ functor argument < value > = < functor va l> = :--: left. this is the graph associated with the vp likes paul: in graph notation: (8) ,. /~ left /~ agr ca~//pers / form cat/pers~nu m s finite n 3 sg it corresponds to the derived-category s mboh (9) s \ n form : finite pers : 3 num: sg (10a) and (10b) are the rules that combine constituents. as in tradit ional categorial grammars, two such rules sufice. (10a) forward functional application (ffa) value -~ functor argument = = = right. in graph notation: val u e~j -~~'~. / funct? r l . ~rgu right ment val u e ~- - j j -~-~rg u ment / left if backward functional application is used to combine the constituents peter and likes paul, the result is a finite sentence. however, if the same rule is applied to the identical constituents likes paul and likes paul, again a finite sentence is obtained. 
'\]\['his is so because the graph for likes paul actually unifies with the value of arg in the same graph. this can be easily remedied by modifying the graph for the vp slightly. by st ipulat ing that the argument must not have an unfilled argument position, one can rule out derivcd categories as subject arguments tbr the vp: (ii) /0o-i /?e?tum s finite n 3 sg 1.3. extens ions to the basic formal i sm. in this subsection \[want to discuss very briefly a few extensions of' the basic model that make it more suitable for the encoding of natural- language rammars. the first one is the sorting of fimctors according to their own syntactic category. this move might be described alternat ively as defining the type of a constituent as being defined by both a set of syntactic (and semantic) 189 attributes and a function from categories to categories. this function is also expressed as the value of an attribute. for a basic category the value of the function attribute is nil. the following graph is a simplified example of a functor category (prenominal djective in a language with case and number agreement within the np). ~ ~/ ~ ~'~unction ca;~/ ~s: :m - -~gr the combination rules need accordingly. this is the modified functional application. to be changed rule of forward value -~ functor argument = < argument > = < functor function arg > = right. in a traditional categorial grammar, a derived category is exhaustively described by the argument and value categories. but often, syntacticians want to make more fine grained distinctions. an example is vp modification. in a traditional categorial grammar, two different vp modifiers, lets say an adverb and an adverbial clause, would receive the same translation. (12) peter called him angrily n (s\n)fn n (s\n)/(s~q) (13) peter called him at work n (s\n)/n n (s\n)/(s~an) 190 but what should be the category for very? 
if it receives the category ((s\n)\(s\n))/((s\n)\(s~n)) to allow the derivation of (14), the ungrammatical sentence (15) is also permitted. (14) peter called him very angrily n (s\n)/n n ((s\n)\(sln))/ (s\n)/(s~x\[) ((s\n)\(s~n')) (15) *peter called him very n (s\n)/n n ((s\n)\(s~))/ ((s\n)\(s\n)) at work (s\n)/(s~) if functor categories are permitted to carry features of their own that are not necessarily bound to to any features of their argument and value categories, this problem disappears. adverbs and adverbial clauses could receive different features even if their categories encode the same combination function. another solution to the problem involves the encoding of the difference in the value part of the functor. yet this solution is not only unintuitive but also contradicts a linguistic generalization. it is unintuitive because there is no difference in the distribution of the resulting vps. the only difference holds between the modifiers themselves. the gene~:alization that is violated by the encoding of the difference in the value subgraphs is the endocentricity of the vp. the modified vp shares all syntactic features with its head, the lower vp. yet the feature that indicates the difference between adverbs and adverbial phrases could not be in both the argument and the value parts of the functor, otherwise iterations of the two types of modifiers as they occur in the following pair of sentences would be ruled out. (16a) peter called him very angrily at work. (16b) peter called him at work very angrily. another augmentation is based on the patr strategy for linking syntax and semantics. most grammars written in patr use the constituent graphs also for encoding semantic information. every constituent has an attribute called trans or semantics. the value of this attribute contains minimally the internal semantic fnnction-argument structure of the constituent, but may also encode additional semantic information. 
the separate encoding of the semantics allows for a compositional semantics even in construction in which syntactic and semantic structure divert as in certain raising constructions. the following graph for a ficticious prenominal adjective that was introduced earlier contains translation attributes for the functor, the argument and the value. the meaning of the adjective is indicated by the atom red. cat ~ / functi% ~rans adj acc ing ~._~g red at first glance, the lexical graphs--even the ones that are used in the highly simplified examples--seem to exhibit an excessive degree of complexity and redundancy. however, the lexical approach to syntax is built on the assumption that the lexicon is structured. to create a lexicon that is structured according to linguistic generalizations, weintroduced lexical templates early on in the development of patr. templates are graphs that contain structure shared by a class of lexical entries. lexical graphs can be partially or fully defined in terms of templates, which themselves can be defined in terms of templates. if a template name appeam in the definition of some graph, the graph is simply unified with the graph denoted by the template. the next augmentation is already built into the formalism. categorial grammarians have recognized the limitations of fimctional application as the sole mode of combining constituents for a long time. one of the obvious extensions to classical categorial grammar was the utilization of functional composition as a further combination mode. a good example of a categorial grammar that employs both functional application and functional composition is steedman (1985). forward functional composition permits the following combination ofcategories: (21) a/b + b/c = a/c the resulting category inherits the argument place for c from the argument b/c. neither steedman's nor any other cg i am aware of permits functional composition i its full generality. 
in order to prevent overgeneration, functional composition as well as other combination modes that are discussed by steedman are restricted to apply to certain categories only. this somehow violates the spirit of a categorial grammar. steedman's combination rules, for instance, are net universal. in cug, functional composition is subsumed under functional application. it is the functor category that determines whether simple functional application, or functional composition, or either one may take place. conjunction is a good case for demonstrating the versatility. consider the following sentences: 3 (22a) peter andpaul like bananas. (22b) peter likes bananas and paul likes oranges. (22c) peter likes and buys bananas. the conjunction and may combine two simple argument categories (22a), two functors with one unfilled argument position (22b), or two functors with more than one unfilled argument position (22c). if the conjuncts have unfilled argument positions, the conjoined phrase needs to inherit them through functional composition. the simplified lexical graph for and is given under (23). in order to avoid a thicket of crossing edges, i have expressed some of the relevant bindings by indices. 191 (23) c ~ .. r ( the most appealing feature of this way of utilizing functional composition is that no additional combinators are required. no restriction on such a rule need to be formulated. it is only the lexical entries for functors that either demand, permit, or forbid functional composition. extensions to the formalism that i have experimented with that cannot be discussed in the frame of this paper are the use of multiple stacks for leftward and rightward arguments and the dcg-like encoding of the ordering positions in the graphs. in sections 3. and 4., i will discuss further extensions ofthe formalism and specific linguistic analyses. 
the following section contains a summary of the motivations for working on and with cug and the main objectives of this work.order variat ion worder order variation has always been one of the hardest problems for categorial grammars. the work on merging strategies from unification grammars and categorial grammars has its origins in several research efforst that have been pursued in parallel. the following section contains a summary of the motivations for working on and with cug and the main objectives of this work. one of them is the grammar development on the patr system (shieber et al, 1983; shieber, 1984) at sri. and 4., i will discuss further extensions ofthe formalism and specific linguistic analyses. in sections 3. in a cug that only contains two (or at least very ? few) rules, the first method of duplicating rules appears preferrable over the gap threading approach. rules that propagate gap information might also include rules that permit parasitic gaps along the lines of steedman's rules of functional substitution. for quite a while now i have been using the excellent facilities of patr for the design and testing of experimental\[ cugs. functional composition together with type-raising can be used to obtain all permutations of the sentences that are generated by a traditional categorial grammar. in patr a preprocessor f rules adds this information for all rules in which the grammar writer did not include any gap threading information herself, e.g., for encoding island constraints. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C88-1016.txt b/DATASET_PACSUM/dataset/inputs/C88-1016.txt new file mode 100644 index 0000000000000000000000000000000000000000..44e46301bb101817e716b0d6f9a60f19ee8ae069 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C88-1016.txt @@ -0,0 +1 @@ +the steps of the proposed translation process are: (1) partition the source text into a set of fixed locutioris. 
(2) use the glossary plus coutextual information to select im corresponding set of fixed ioctttions into a sequen{e forming the target sentence. (3) arrange the words of the talget fixed locutions into a sequence forming the target sentence. we have developed stalistical techniques facilitating both tile autonlatic reation of the glossary, and the performance of tile three translation steps, all on the basis of an aliglnncllt of corresponding sentences in tile two texts. while wc are not yet able to provide examples of french / english tcanslation, we present some encouraging intermediate results concerning lossary creation and the arrangement of target wold seq l ie) lees . introduction in this paper we will outline an approach to automatic translation that utilizes techniques of statistical information extraction from large data bases. these self-organizing techniques have proven successful in the field of automatic speech recognition [1,2,3]. statistical approaches have also been used recently in lexicography [41 and natural anguage processing [3,5,6]. the idea of automatic translation by statistical (information thco,etic) methods was proposed many years ago by warren weaver [711. as will be seen in the body of tile paper, tile suggested technique is based on the availability of pairs of large corresponding texts that are iranslations of each other. i l l particular, we have chosen to work with the english and french languages because we were able to obtain the biqingual l lansard corpus of proceedings of the canadian parliament containing 30 million words of text [8]. we also prefer to apply our ideas initially to two languages whose word orcter is similar, a condition that french and english satisfy. our approach eschews the use of an internmdiate ,nechalfism (language) that would encode the "meaning" of tile source text. the proposal will seem especially radical since very little will be sakl about employment of conventional grammars. 
this omissiol], however, is not essential, and may only rcllect our relative lack of tools as well as our uncertainty about tile degree of grammar sophistication required.a statistical approach to language translat ion p. brown, j. cocke, s. del i ,a pietra, v. della p ietra, f. jel inek, r, mf, rcf, r, and p. roossin ibm research divis ion t.j. watson research center depar tment of computer science p.o. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C88-2121.txt b/DATASET_PACSUM/dataset/inputs/C88-2121.txt new file mode 100644 index 0000000000000000000000000000000000000000..502645a9778622c2087ebfbcc4ea4f78feec05ed --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C88-2121.txt @@ -0,0 +1 @@ +we argue that even if one extends the domain of locality of cfgs to trees, us- ing only substitution does not givo the freedom to choose the head of each structure. we show how adjunction al- lows us to lexicalize a cfg freely. we then show how a lexicalized grammar naturally follows from the extended omain of locality of tags and present some of the linguistic advantages ofour approach. a novel general parsing strategy for lexicalized gram- mars is discussed. in a first stage, the parser builds a set structures corresponding to the input sentence and in a second stage, the sentence is parsed with respect o this set. the strategy is independent of the linguistic theory adopted and of the underlying rammar formalism. how- ever, we focus our attention on tags. since the set of trees needed to parse an input sentence is supposed to be finite, the parser can use in principle any search strategy. thus, in particular, a top-down strategy can be used since problems due to recursive structures are eliminated. the parser is also able to use non-local information to guide the search. we then explain how the earley-type parser for tags can be modified to take advantage of this approach. 
*this work is partially supported by aro grant daa29-84-9- 007, darpa grant n0014-85-k0018, nsf grants mcs-82-191169 and dgr-84-10413. the second author is also partially supported by j.w. zellldja grant. the authors would llke to thank mitch marcus for his helpful conunents about this work.parsing strategies with lexicalized grammars: appl icat ion to tree adjoining grammars * yves schabes, anne abe ille**and arav ind k. joshi department of computer and information science university of pennsylvania philadelphia pa 19104-6389 usa schabes~linc.cis.upenn.edu abeille~cis.upenn.edu joshi~eis.upenn.edu abstract in this paper we present a general parsing strategy that arose from the development of an earley-type parsing al- gorithm for tags (schabes and joshi 1988) and from re- cent linguistic work in tags (abeille 1988). \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C88-2128.txt b/DATASET_PACSUM/dataset/inputs/C88-2128.txt new file mode 100644 index 0000000000000000000000000000000000000000..251d424f8a1f61b40742981b573df984906ace40 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C88-2128.txt @@ -0,0 +1 @@ +this tuning allows the parsing system to operate within the same realm of efficiency as previous architectures for parsing alone, but with much greater flexibility for engaging in other processing regimes. 1 introduction the use of a single grammar for both parsing and generation is an idea ~ith a certain elegance, the desirability of which several researchers nave noted. of course, judging the correctness of such a system re- quires a characterization f the meaning of grammars that is indepen- dent of their use by a particular processing, mechanism--that is, the brmalism in which the grammars are expressed must have an abstract ~emantics. as a paradigm example of such a formalism, we might take ~ny of the various logic- or unification-based grammar formalisms. 
as described by pereira and warren [1983], the parsing of strings ~ccording to the specifications ofa grammar with an independent log- cal semantics can be thought of as the constructive proving of the ;trings grammaticality: parsing can he viewed as logical deduction. -3ut, given a deductive framework that can represent the semantics ff the formalism abstractly enough to be independent of processing, he generation of strings matching some criteria can equally well be hought of as a deductive process, namely, a process of constructive ~roof of the existence of a string that matches the criteria. the dif- erence rests in which information is given as premises and what the ~oal is to be proved. this observation opens up the following possi- bility: not only can a single grammar be used by different processes ~ngaged in various "directions" of processing, but one and the same anguage-processing architecture can be employed for processing the grammar in the various modes. in particular, parsing and generatioa :an be viewed as two processes engaged in by a single parameterized ;heorem prover for the logical interpretation of the formalism. we will discuss our current implementation f such an architecture, ~hich is parameterized in such a way that it can be used either for ~arsing or generation with respect o grammars written in a particular ~rammar formalism which has a logical semantics, the patr formal- sm. furthermore, the architecture allows fine tuning to reflect differ- mt l::ocessing strategies, including parsing models intended to mimic )s~cholinguistic phenomena. this tuning allows the parsing system to )perate within the same realm of efficiency as previous architectures or parsing alone, but with much greater flexibility for engaging in ,ther processing regimes. *this research was sponsored by the nippon telegraph and telephone corpo- ation under a contract with sri international. 
2 language processing as deduction viewed intuitively, natural-language-utterance gen ration is a nonde- terministic top-down process of building a phrase that conforms to certain given criteria, e.g., that the phrase be a sentence and that it convey a particular meaning. parsing, on the other hand, is usu- ally thought of as proceeding bottom-up in an effort to determine what properties hold of a given expression.a uniform architecture for parsing and generation stuart m. shieber artificial intelligence center sri international menlo park, california, usa* abst rac t the use of a single grammar for both parsing and generation is an idea with a certain elegance, the desirability of which several researchers have noted. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C88-2147.txt b/DATASET_PACSUM/dataset/inputs/C88-2147.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7255abb9acd0fc2ce1dd3d6a663f63f5ff1533f --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C88-2147.txt @@ -0,0 +1 @@ +1986, kasper et al. 1986] involving the logical formulation of feature structures. s s np / ~ ~ vp det n v i i i the man met u i u 2 u 3 figure 1: initial trees nnnp / det i ~c n i woman 1 int roduct ion tree adjoining grammars (tag) were first introduced by joshi, levy, and takalmshi [joshi et al. the first study of this system, from the point of view of its formal properties and linguistic applicability, was carried out by joshi in [joshi 1985]. tags have been used in providing linguistic analyses; a detailed study of the linguistic relevance was done by kroch and joshi in [kroch et al. in this paper, we show lmw tags can be embedded in a feature struc- ture based framework. feature structure based tree adjoining grammars (ftag) are introduced in section 2, and is f611owed by a comparsion of the descriptive capacity of ftag and tag. a restricted version of ftag is proposed and some possible linguistic stipulations are considered. 
in section 3, we introduce a calculus, which is an extension of the logical calculus of rounds and kasper [rounds et al. 1986, kasper et al. 1986] allowing a-abstraction and application, in order to describe the structures used in ftags. finally, in section 4, we summarize the work presented in this paper. 1.1 in t roduct ion to t ree ad jo in ing grammars tree adjoining grammars (tag), unlike other grammatical systems used in computational linguistics, is a tree rewriting system. unlike the string rewriting formalisms which writes recursion into the rules that generate the phrase structure, a tag factors reeursion and dependencies into a finite set of elementary trees. the elementary trees in a tag correspond to minimal inguistic structures that localize the dependencies such as agreement, subcategorization, a d filler-gap.feature structures based tree adjoining grammars 1 k. vijay-shanker department of computer and information sciences university of delaware newark, de 19711 u.s.a a. k. joshi del)artment of computer and information science university of pennsylvania philadelphia, pa 19104 u.s.a abstract we have embedded tree adjoining grammars (tag) in a fea- ture structure based unification system. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C90-2067.txt b/DATASET_PACSUM/dataset/inputs/C90-2067.txt new file mode 100644 index 0000000000000000000000000000000000000000..05a01b392123f0fc0123c2152f32566ab1bf5919 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C90-2067.txt @@ -0,0 +1 @@ +automated language understanding requires the determination f the concept which a given use of a word represents, a process referred to as word sense disambiguation (wsd). wsd is typically effected in natural llanguage processing systems by utilizing semantic teature lists for each word in the system's lexicon, together with restriction mechanisms such as case role selection. 
however, it is often impractical to manually encode such information, especially for generalized text where the variety and meaning of words is potentially unrestricted. furthermore, restriction mechanisms usually operate within a single sentence~ and thus the broader context cannot assist in the disambiguation process. in this paper, we describe a means tor automatically building very large neural networks (vlnns) from definition texts in machine-readable dictionaries, and denmnstrate he use of these networks for wsd. our method brings together two earlier, independent approaches to wsd: the use of machine-readable dictionaries and spreading and activation models. the automatic onstruction of vlnns enables real-size experiments with neural networks, which in turn the authors would like to acknowledge the contributions of st~phanc tlari6 and gavin huntlcy to the work presented in this paper. provides insight into their behavior and design and can lead to possible improvements.automated language understanding requires the determination f the concept which a given use of a word represents, a process referred to as word sense disambiguation (wsd). the model we describe here is only a first step toward a fuller understanding and refinement of the use of vlnns for language processing, and it opens several interesting avenues for further application and research. the use of word relations implicitly encoded in machine-readable dictionaries, coupled with the neural network strategy, seems to offer a promising approach to wsd. provides insight into their behavior and design and can lead to possible improvements. more practically, it is simply difficult to imagine how vectors of several thousands of microfeamrcs for each one of the lens of thousands of words and hundreds of thousands of senses can be realistically encoded by hand. our approach to wsd takes advantage of both strategies outlined above, but enables us to address solutions to their shortcomings. 
our experimentation with vlnns has also shed light on the role of and need for various other parameters, uch as thresholds, decay, etc. in our model, word nodes corttrol the behavior of sense nodes by means of a differential neuron that prevents, for example, a sense node from becoming more activated than its master word node. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C90-3030.txt b/DATASET_PACSUM/dataset/inputs/C90-3030.txt new file mode 100644 index 0000000000000000000000000000000000000000..56211bcfb3445c1f8a3b19b0189f727d1bcfe4de --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C90-3030.txt @@ -0,0 +1 @@ +the formalism is a linguistic one. it relies on transitional probabilities in an indirect way. the probabilities are not part of the description. the descriptive statements, constraints, do not have the ordinary task of defining the notion correct sentence in l. they are less categorical in nature, more closely tied to morphological features, and more directly geared towards the basic task of pars- ing. we see this task as one of inferring surface structure from a stream of concrete tokens in a basically bottom-up mode. constraints are formu- lated on the basis of extensive corpus studies. they may reflect absolute, ruleqike facts, or probabilistic tendencies where a certain risk is judged to be proper to take. constraints of the former rule-like type are of course preferable. the ensemble of constraints for language l con- stitute a constraint grammar (cg) for l. a cg is intended to be used by the constraint grammar parser cgp, implemented as a lisp interpreter. our input tokens to cgp are morphologically ana- lyzed word-forms. one central idea is to maximize the use of morphological information for parsing purposes. all relevant structure is assigned directly via lexicon, morphology, and simple mappings from morphology to syntax. 
]he task of the constraints is basically to discard as many alternatives as possible, the optimum being a fully disambiguated sentence with one syntactic reading only. the second central idea is to treat morphological disambiguation and syntactic labelling by the same mechanism of discarding improper alternatives. 168 a good parsing formalism should satisfy many re- quirements: the constraints should be declarative rather than procedural, they should be able to cope with any real-world text-sentence (i.e.constraint grammar as a framework for parsing running text fred karlsson university of helsinki department of general linguistics hallituskatu 11 sf-00100 helsinki finland e-mail: karlss?n@finuh.bitnet 1. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C90-3044.txt b/DATASET_PACSUM/dataset/inputs/C90-3044.txt new file mode 100644 index 0000000000000000000000000000000000000000..02736a9d1d7b5ec74b16caf3cf22a8baacd0187e --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C90-3044.txt @@ -0,0 +1 @@ +(2) transdr the source matching expression into the target matching expression. (3) construct the target sentence from the target matching expression. this mechanism generates some candidates of translation. to select, the best translation out of them, we define the score of a translation. 1 in t roduct ion use of extracted information fiom examples or example-based translation is becoming the new wave of machine translation. the ba.- sic idea. of example~based translation is very simple: translate a source sentence by imitat- ing the translation example of a similar sen- tence in the database. the idea first appeared in [nagao 84], and some research has followed it [sumita 88][sato 89][sadler 89a. but a great deal of effort is still needed to im- plemenl the idea. in our previous work, we show how to select. the best target word in case-frame translation based on examples[sato 89]. in this paper, we concentrate on two problems: 1. 
ltow to combine some fragments of trans- lation examph~s in order to translate one sentence? 2. tlow to select tile best tra.nslation out of inany candidates? we show partial solutions for them in mbt2. mbt2 is the second prototype system in our memory-based translation project.. mbt2 ca.n do bi-directional m~nslation between an english word-dependency tree and a japanese word- dependency tree.toward memory--based translation satoshi sato and ma.koto nagao dept. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C90-3045.txt b/DATASET_PACSUM/dataset/inputs/C90-3045.txt new file mode 100644 index 0000000000000000000000000000000000000000..70097e9e296f98795ac0686c56d1ab20c65c34dc --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C90-3045.txt @@ -0,0 +1 @@ +tree-adjoining rammars (tag) constitute a grammat- ical formalism with attractive properties for the strong characterization f the syntax of natural angtmges, that is, characterization of the analysis trees of the expres- sions in the language (kroch and joshi, 1985; kroch, 1989)) among these properties are that o the domain of locality in tags is larger than lot formalisms lhat augment context-free grammars (such as lexical-functkmal, or generalized or head- driven phrase-structure grammar), and ? the statements of dependencies and recursion pos- sibilities in a tree are factored, the former following from primitive dependencies in elementary trees, the latter a consequence of an operatkm of adjunc- tion of trees. these unique properties of tags present a challenge tot the application of tags beyond the limited confines of syntax, for instance, to the task of semantic interpre- tation or automatic tr~mslation of natural anguage. the slandm'd methods of moving beyond syntax to interpre- tation make use in one way or another of the compo- sitional structure of the analysis tree that is manifested in the tree's derivation. 
any version of compositional 1we assume familiarity throughout the paper with previous work on tags. see, for instance, the introduction by joshi (1987). semantics, or syntax.directed translation relies on such a methodology to some extent. however, in the case of tags, the compositional structure of the tree is not miro rored by its derivational structure, so that a method for constructing semantics based on the compositional syn- tactic structure will be inherently nonderivational, that is, construction of the semantics will be independent of the derivation of the tree, and therefore subsequent. on the other hand, a method mirroring the deriva- tional structure will not necessarily be compositional with respect to tile derived structures of expressions. ai+ tl~ough such a method would be quite different from ttle primarily compositional methods previously postulated, it may have advantages, given that certain aspects of language seem to be noncompositional. (see section 4.) in this paper, we present a varim~t of tags, called synchronous tags, which characterize correstxmdences between languages. the formalism's intended usage is to relate expressions of natural anguages to their asso- ciated semantics represented in a logical form language, or to their translations in another natural language; in summary, we intend the formalism to allow tags to be used beyond their role in syntax proper. we also discuss its application to concrete xamples, and mention some computational issues that arise in its interpretation.the synchronous tag formalism is inherently nondirec- tional. if the representation can be left im- plicit, the optimization can be maintained, but retrieval of explicit representations will be combinatorially more complex. 
tree-adjoining rammars (tag) constitute a grammat- ical formalism with attractive properties for the strong characterization f the syntax of natural angtmges, that is, characterization of the analysis trees of the expres- sions in the language (kroch and joshi, 1985; kroch, 1989)) among these properties are that o the domain of locality in tags is larger than lot formalisms lhat augment context-free grammars (such as lexical-functkmal, or generalized or head- driven phrase-structure grammar), and ? the statements of dependencies and recursion pos- sibilities in a tree are factored, the former following from primitive dependencies in elementary trees, the latter a consequence of an operatkm of adjunc- tion of trees. derivation is not defined in terms of constructing 6the subscript x on certain nodes is the value of a feature on the nodes corresponding to the variable bound by the quantifier. we also discuss its application to concrete xamples, and mention some computational issues that arise in its interpretation. stan- dard parsing algorithms for both tags and cfgs rely on this optimization. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C90-3052.txt b/DATASET_PACSUM/dataset/inputs/C90-3052.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec376b3e0720b073da5bce85f10482110e1b7d4d --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C90-3052.txt @@ -0,0 +1 @@ +authors describe these extensions as "inheritance grammars", "in- heritance networks", :ii;ature sorts", "typed t~ature structures",...[1, 3, 5, 13, 17, 15, 9, 11, 7, 8]. these formalisms exhibit, to various degrees, one or several of the following properties, characteristic of the so-called object-oriented paradigm: a high level of abstraction, a capacity of inference, modularity and distributed control. 
abstraction and modular- ity are needed when the linguist wants to describe a hierarchy of concepts (like a lexical hierarchy or the hierarchy of phrasal categories), and to describe linguistic data at different levels (e.g. morphology, syntax, semantics). at first glance it seems rather natural to develop separate modules for different lin- guistic levels, and to describe separately their inter- actions; however, great difficulties are encountered when these modules have to be integrated. usually, there are two choices. either everything is described in a single place using a deeply intricate data struc- ture, like packing both syntactic and semantic equa- tions in cf rules in some lfg extensions (e.g. [10]); the price is a loss in understmtdability and general~ ity. or descriptions are kept separate and the pro- eessing is done accordingly: first, a morphological phase, then a syntactic analysis, and then a semantic analysis, without any communication between these different steps [4]. the price is that interdependent constraints between these levels are lost, resulting in inadequate linguistic description or very complex control strategies at the implementation level. in this paper, we argue that typed unification gram- mars give the linguist a formal framework which has the desirable properties. we will give an introduc- tion to such a formalism, called if,_ (~iyped i"ea- ture structure), which integrates disjunctions, con- junctions and conditional expressions of typed fea- ture structures. this introduction will start from a very simple dcg, and will show how one can write a dcg-like grammar in tfs, making use of the typing system to enforce general constraints valid for classes of objects and to modularize linguistic descriptions. we then show that further abstraction leads to a i-[psg-like grammar. 
it is not our goal to give here a formal account of the formalism (the interested reader should refer to [2] where a very clear formal semantics on which tfs is based is given), and we will use an informal approach wherever possible.typed unification grammars martin c. emele, dhni zajac project polygloss* university of stuttgart ims~cl/ifl~ais, keplerstrage 17, d - 7000 stuttgart 1, federal republic of germany {emele,zajac} @is.informatik.uni-st ut gart.dbp.de abstract we introduce tfs, a computer formal- ism in the class of logic formalisms which integrates a powerful type system. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C90-3063.txt b/DATASET_PACSUM/dataset/inputs/C90-3063.txt new file mode 100644 index 0000000000000000000000000000000000000000..15df5b104c1efa585a969cca95b420ec9bd0747d --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C90-3063.txt @@ -0,0 +1 @@ +the results of the experiment show that in most of the cases the cooccurrence statistics indeed reflect the semantic constraints and thus provide a basis for a useful disambiguation tool. 1 introduction the use of selectional constraints is one of the most popular methods in applying semantic information to the resolution of ambiguities in natural languages. the constraints typically specify which combina- tions of semantic classes are acceptable in subject- verb-object relationships and other syntactic struc- tures. this information is used to filter out some analyses of ambiguous constructs or to set prefer- ences between alternatives. though the use of selectional constraints is very popular, there is very little success (if any) in im- plementing this method for broad domains. the major problem is the huge amount of information that must be acquired in order to achieve a rea- sonable representation of a large domain. 
in order to overcome this problem, our project suggests an alternative to the traditional model, based on auto- matic acquisition of constraints flom a large corpus. the rest of the paper describes how this method is used to resolve anaphora references. similarly, the constraints are used also to resolve syntactic am- biguities, but this will not be described here. the *part of this resemch was conducted wb.ile visiting ibm t. j. watson research center, yorktown ileights, ny reader should bare in mind that like the conven- tional use of selectional constraints, our method is inteuded to work in co,tjunction with other disam- biguation means. these, such as various syntactic and pragmatic onstraints and heuristics [carbonetl and brown p.)88, tlobbs 1978], represent additional levels of knowledge and are essential when selec- tional constraints are not sufficient. 2 the statist ical approach according to the statistical model, cooccurrence patterns that were observed in tile corpns are used as selection patterns. whenever several alternatives are presented by an ambiguous construct, we prefer the one correspot~ding t.omore frequent patterns. when using selectional constraints for anaphora resolution, the referent must satisfy the constraints which are imposed on the anaphor. if the anaphor participates in a certain syntactic relation, like be- ing an object of some verb, then the substitution of the anaphor with the referent must satisfy the selectional constraim.s.automatic processing of large corpora fbr the resolution of anaphor references ido dagan * alon itai computer science department technion, tiaifa, israel dagan~techunix .b i tnet , i ta i~ cs.technion, ac.il abstract manual acquisition of semantic onstraints in broad domains is very expensive. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C92-1019.txt b/DATASET_PACSUM/dataset/inputs/C92-1019.txt new file mode 100644 index 0000000000000000000000000000000000000000..47a41a5b7c78b2de11ffb974043ab195f9a8c06d --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C92-1019.txt @@ -0,0 +1 @@ +chinese sentences arc cx)mposed with string of characters without blanks to mark words. however the basic unit for sentence parsing and understanding is word. therefore the first step of processing chinese sentences is to identify the words( i.e. segment the character strings of the sentences into word strings). most of the current chinese natural language processing systems include a processor for word iden- tification. also there are many word segmentation techniques been developed. usually they use a lexicon with a large set of entries to match input sentences \[2,10,12,13,14,21\]. it is very often that there are many l~)ssible different successful matchings. therefore the major focus for word identification were on thc resolu- tion of ambiguities. however many other important as- pects, such as what should be done, in what depth and what are considered to be the correct identifications were totally ignored. high identification rates are claimed to be achieved, but none of them were mea- sured under equal bases. there is no agreement in what extend words are considered to be correctly iden- tified. for instance, compounds occur very often in chi- nese text, but none of the existing systems except ours pay much attention to identify them. proper name is another type of words which cannot be listed exhaus- tively in the lexicon. therefore simple matching algo- rithms can not successfully identify either compounds or proper names. 
in this paper, we like to raise the problems and the difficulties in identifying words and suggest the possible solutions.in this paper, we like to raise the problems and the difficulties in identifying words and suggest the possible solutions. chinese sentences are composed with string of characters without blanks to mark words. therefore simple matching algo- rithms can not successfully identify either compounds or proper names. however the basic unit for sentence parsing and understanding is word. proper name is another type of words which cannot be listed exhaus- tively in the lexicon. therefore the first step of processing chinese sentences is to identify the words( i.e. segment the character strings of the sentences into word strings). for instance, compounds occur very often in chi- nese text, but none of the existing systems except ours pay much attention to identify them. most of the current chinese natural language processing systems include a processor for word iden- tification. also there are many word segmentation techniques been developed. there is no agreement in what extend words are considered to be correctly iden- tified. high identification rates are claimed to be achieved, but none of them were mea- sured under equal bases. usually they use a lexicon with a large set of entries to match input sentences \[2,10,12,13,14,21\]. it is very often that there are many possible different successful matchings. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C92-1025.txt b/DATASET_PACSUM/dataset/inputs/C92-1025.txt new file mode 100644 index 0000000000000000000000000000000000000000..363f0619eeb8003cc1d1cea5f7294b22b0f809a1 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C92-1025.txt @@ -0,0 +1 @@ +morphological information on the leaf nodes of trees ? 
transducers that encode morphological alternations an analysis of an inflected word form is produced by mapping the input form to a sequence of lexical forms through the transducers and by composing some out- put from the annotations on the leaf nodes of the lexical paths that were traversed. comprehensive morphological descrip- tions of this type have been developed for several languages including finnish, swedish, russian, english, swahili, and arabic. although they have several good features, these kimmo-systems also have some limitations. the ones we want to ad- dress in this paper are the following: (1) lexical representations tend to be arbitrary. because it is difficult to write and test two-level systems that map between pairs of radically dissimilar forms, lexical representations i existing two-level analyzers tend to stay close to the surface forms. this is not a problem for morpho- logically simple languages like english because, for most words, inflected forms are very similar to the canonical dictionary entry. except for a small number of irregular verbs and nouns, it is not difficult to create a two-level description for english in which lexical forms coincide with the canonical citation forms found in a dictionary. however, current analyzers for mor- phologically more complex languages (finnish and russian, for example) are not as satisfying in this respect. in these systems, lexical forms typically contain diacritic markers and special symbols; they are not real words in the language. for example, in finnish the lexical counterpart of otin i took might be rendered as ottallln, where t, al, and i1 are an arbitrary encoding of morpho- logical alternations that determine the allomorphs of the stem and the past tense morpheme. the canonical citation form ottaa to take is composed from annotations on the leaf nodes of the letter trees that are linked to match the input. it is not in any direct way related to the lexical form produced by the transducers. 
(2) morphological categories are not directly encoded as part of the lexical form. instead of morphemes like plural or past, we typically see suffix strings like +s, and +ed, which do not by themselves indi- cate what morpheme they express.two-level morphology with composition lauri karttunen, ronald m. kaplan, and annie zaenen xerox palo alto research center center for the study of language and information stanjbrd university 1. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C92-1038.txt b/DATASET_PACSUM/dataset/inputs/C92-1038.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cf8e47ef59dd54abcb598d4e6957bd30ac6fa29 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C92-1038.txt @@ -0,0 +1 @@ +in particular, as compared to these algorithms, human speakers pay far less attention to reducing the length of a referring expression, and far more attention to making sure they use attributes and values that human hearers can easily process; in the terms introduced in [da188,da189], hearers are more concerned with the principle of sensitivity than with the principle of efficiency. we have designed a new referring expression generation algorithm that is based on the~ observations, and believe that the new algorithm is more practical for real-world natu- ral language generation systems than the algorithms we have previously proposed. in particular, the al- gorithm is: ? fast: its run-time is linear in the number of distrac- tors, and independent of the number of possible modifiers; ? sensitive to human preferences: it attempts to use easily perceivable attributes and basic-level [ros78] attribute values; and ? supported by serc grant gr/f/36750. e-mail ad- dress is e.reiter@ed. taiso of the centre for cognitive science at the univer- sity of edinburgh. e-mail address i r. daleqed. ehud re i te r*and rober t da le f depar tment of art i f ic ia l inte l l igence un ivers i ty of ed inburgh ed inburgh eh1 1tln scot land ? 
domain-independent: he core algorithm should work in any domain, once an appropriate knowl- edge base and user model has been set up. a version of the algorithm has been implemented within the idas natural-language neration system [rml92], and it is performing satisfactorily. the algorithm presented in this paper only gener- ates definite noun phrases that identify an object that is in the current focus of attention. algorithms and models that can be used to generate pronominal and one-anaphoric referring expressions have been presented elsewhere, .g., [sid81,gjw83,da189]. we have recently begun to look at the problem of gen- erating referring expressions for objects that are not in the current focus of attention; this is discussed in the section on future work.a fast algorithm for the generation of referring expressions abst rac t we simplify previous work in the development of algorithms for the generation of referring expre~ sions while at the same time taking account of psy- cholinguistic findings and transcript data. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C92-2070.txt b/DATASET_PACSUM/dataset/inputs/C92-2070.txt new file mode 100644 index 0000000000000000000000000000000000000000..fafe08fd75baf45bbd67728aced81a3642749175 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C92-2070.txt @@ -0,0 +1 @@ +our use of class models overcomes this knowledge acquisition bottleneck, enabling training on unresuicted monolingual text without human intervention. applied to the 10 million word groliers encyclopedia, the system correctly disambiguated 92% of the instances of 12 polysemous words that have been previously studied in the literature. problem formulation this paper presents an approach to word sense disambiguation that uses classes of words to derive models useful for disambignating individual words in context. 
"sense" is not a well defined concept; it has been based on subjective and often subtle distinctions in topic, register, dialect, collocation, part of speech and valency. for the purposes of this study, we will define the senses of a word as the categories li ted for that word in rogers international thesaurus (fourth edition - chapman, 1977). 1sense disambiguation will constitute 1. note that his edition of rogers thesaurus is much more 0ttm$ive than the 1911 vmsion, though somewhat more difficult to obtain in electronic form, one could me other other concept hlemrehics, such as wordnet (miller, 1990) or the ldoce mbject codes (slator, 1991). all that it necessary is ? set of semamic categories and ? list of the words in each category. selecting the listed category which is most probable given the surrounding context. this may appear to be a particularly crude approximation, but as shown in the example below and in the table of results, it is surprisingly successful. i nput output tvr.admillsauachedto cranu were used to lift heavy tools for supplying powe? for cranes, hoists, and lift s. tools hovetlfitheisht,atower crane is oftea med .sb tm* tools ?labocate oaumhip ribalds cranes build ? nest of vegetafi animal are more closely tv.lated to cranes and rails .sn they ran animal low tees ,pp at least five crane species are in danger of animal.word-sense disambiguation using statistical models of rogets categories trained on large corpora david yarowsky at&t bell laboratories 600 mountain avenue murray hil l n j, 07974 yarowsky@research.att .com abst rac t this paper describes a program that disambignates english word senses in unrestricted text using statistical models of the major rogets thesaurus categories. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C92-2082.txt b/DATASET_PACSUM/dataset/inputs/C92-2082.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0d7ea1c4aa854d6b776e7254f8a1838ee31a2ed --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C92-2082.txt @@ -0,0 +1 @@ +extensions and applications to areas uch as information retrieval are suggested. 1 in t roduct ion currently there is much interest in the automatic ac- quisition of lexiea[ syntax and semantics, with the goal of building up large lexicons for natural lain guage processing. projects that center around ex- tracting lexical information from machine readable dictionaries (mrds) have shown much success but are inherently limited, since the set of entries within a dictionary is fixed. in order to find terms and ex- pressions that are not defined in mrds we must turn to other textual resources. for this purpose, we view a text corpus not only as a source of information, but also as a source of information about the language it is written in. when interpreting unrestricted, omain-independent text, it is difficult to determine in advance what kind of infbrmation will be encountered and how it will be expressed. instead of interpreting everything in the text in great detail, we can searcil for specific lexical relations that are expressed in well-known ways. sur- prisingly useful information can be found with only a very simple understanding of a text. consider the following sentence: 1. (si) the bow lu te , such as the bambara ndang, is plucked and has an ind iv idual curved neck :for each string. most fluent readers of english who }lave never be- fore encountered the term q3amhara ndang" will nev- ertheless from this sentence infer that a "bambara udang" is a kind of "bow iute". this is true even if tile reader has only a fuzzy conception of what a how lute is. 
note that the attthor of the sentence is not de- liberately defining the term, as would a dictionary or a childrens book containing a didactic sentence like a bambara ndang is a kind of bow lute. however, the semantics of the lexico-syntactic construction i - dicated by the pattern: (la) npo ..... h as {np1, np2 . (and ior)} np,, are such that they imply (lb) for all np , , 1 < i< n, hyponym(npi, npo) thus from sentence (si) we conclude hyponym ( "barn bare n dang", "how lu re").automatic acquisition of hyponyms ~om large text corpora mart i a. hearst computer science division, 571 evans hall un ivers i ty of cal i fornia, berkeley berkeley, ca 94720 and xerox palo a l to research center mart i~cs , berkeley, edu abst rac t we describe a method for the automatic acquisition of the hyponymy lexical relation from unrestricted text. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C92-3126.txt b/DATASET_PACSUM/dataset/inputs/C92-3126.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd8280b47662ba8dfdf3ef96d0402af7bad3a598 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C92-3126.txt @@ -0,0 +1 @@ +since that article is written in dutch, we will translate some parts of it more or less literally in this introduction. according to scba, the current radition of language processing systems is based on linguistically motivated competence models of natural imlguages. "llte problems that these systems lull iato, suggest file necessity of a more perfommnce oriented model of language processing, that takes into account the statistical properties of real language use. qllerefore scha proposes a system ritat makes use of an annotated corpus. anmyzing a new input means that the system attempts to find the most probable way to reconstruct the input out of fragments that already exist in the corpus. 
the problems with competence grammars that are mentioned in schas aiticle, include the explosion of ambiguities, the fact tilat itunmn judgemeats on grammaticality are not stable, that competence granunars do not account for language h~alge, alld that no existing rule-based grammar gives a descriptively adequate characterization of an actual language. according to scha, tile deveh,pment of a fornml gnatunar fur natural latlguage gets more difficult ,as tire grammar gets larger. when the number of phenotnena one has already takea into account gets larger, the number of iareractions that must be considered when ,me tries to introduce all account of a new pllenomenon grows accordingly. as to tile problem of ,mtbiguity, it has turned out that as soon as a formal gratmnar clmracterizes a non-trivial part of a natural anguage, almost every input sentence of reasonable length gets ml re]manageably large number of different structural analyses (and * the author wishes to thank his colleagues at the department of computational linguistics of the ilaiversity of amsterdam for many fruitful discussions, and, in particular, remko scha, martin van den berg, kwee tjoe l,iong and frodenk somsen for valuable comments on earlier w~rsions of this paper. semantical interpretations). i "lids is problenmtic since most of these interpretations ~re not perceived as lvossible by a hunmn language user, while there are no systematic reasons 111 exclude tileln on syutactic or sematltic grounds. often it is just a ntatter of relative implausibility: tile only reason why a certain iarerpmtarion of a sentence is not perceived, is that aanther interprctatilm is much more plausible. competence and performance tale lhnriations of the current language procossing systerus are not suprising: riley are the direct consequence of rile fact that these systems implement chart]skys notion of a coutpetence grmnmar. 
the formal grilnuuars that constitute the subject-nmtter of theoretieal linguistics, aim at characterizing the clnnpetencc of tile langnage user. but the preferences language users have m the case of ambiguous entences, are paradigm instances of perfonatmce phenomena.a computational model of language data oriented parsing rens bolt* department of computational i jnguistics university of amsterdmn spuistraat 134 1012 vii amsterdam the netherlands rens@alf.let.uva.nl performance: abstract 1)ata oriented parsing (ix)p) is a model where no abstract rules, but language xt~riences in the ti3ru~ of all ,malyzed colpus, constitute the basis for langnage processing. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C92-3150.txt b/DATASET_PACSUM/dataset/inputs/C92-3150.txt new file mode 100644 index 0000000000000000000000000000000000000000..4504b59dc3bead70eef48fbeae67a9b25b6152bd --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C92-3150.txt @@ -0,0 +1 @@ +in this article, the type of analysis used (surface grammatical nalysis) is highlighted, as the methodological pproach adopted to adapt the rules (experimental approach). i ) const i tu t ing constituting a terminology of a subject field, that is to say establishing a list of the terminological units that represent the concepts of this field, is an oft-encountered problem. for the research development division of electricit6 de france (french electricity board), this problem arose in the information documentation sector. an automatic indexing system, using different thesauri according to the application, has been operational for three years or more [monteil 1990]. the terminologists and information scientists need a terminology a te rmino logy extraction tool in order to keep these thesauri up to date in constantly changing fields and to create "ex nihilo" thesauri for new fields. 
this is the reason why the terminological extracting software, lexter, was developed, forming the first link in the chain that goes to make up the thesaurus. a corpus of french- language texts is fed into lexter, which gives out a list of likely terminological units, which are then passed on to art expert for validation. aulxs de coling-92, nantes, 23-28 ao~r 1992 9 7 7 proc. of coling-92, nantes, aug. 23-28, 1992 2) what is a terminological unit ? the main aim here is not to provide a rigorous definition of what a terminological unit is, but rather to outline its essential features, and thus to justify the hypotheses (concerning the form of terminological units) on which lexter is based. semantic function : the representation of the concept the first characteristic of the terminological unit is its function as the representation of a concept. the terminological unit plays this role of representation in the framework of a terminology, which is the linguistic evidence of the organisation of a field of knowledge in the form of a network of concepts; the terminological unit represents a concept, uniquely and completely, taken out of any textual context. the existence of this one-to-one relationship between a linguistic expression and an extra-linguistic object is, as we shall see, a situation which particulary concerns the terminological units. the appearance of a new terminological unit is most often a parallel process to that of the birth of the concept which it represents. this "birth" is marked by the consensus of a certain scientific ommunity.surface grammatical analysis for the extraction of terminological noun phrases didier bourigault ecole des hautes etudes en sciences sociales et electlicit6 de france direction des etudes et recherches 1, avenue du g6n6ral de gaulle 92141 clamart cedex france tel : +33 1 47 65 50 64 abstract lexter is a software package for extracting terminology. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C94-1027.txt b/DATASET_PACSUM/dataset/inputs/C94-1027.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6488c01f134fb53f356241feab82d315de6e83c --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C94-1027.txt @@ -0,0 +1 @@ +in an utterance, this ambiguity is normally resolved by the context of a word: e.g. in the seutence "the 1977 p6s could store two pages of data. ", store can only be an intluitive. a part-of-speech tagger is a system which automat- ically assigns the part of speech to words using con- textual information. potential applications for part- of-speech taggers exist in many areas inclnding speech recognition, speech synthesis, machine translation and information retrieval. l)ifiereut methods have been used for the im plemen- ration of part-of-speech taggers. taggit (greene, rnbin, 1971), an early system, which was used for the initial tagging of the brown corpus was rule-based. it was able to assign the correct part-of-speech to about 77 % of the words in the brown corpus. in another approach contextual dependencies are modelled statistically. churcb (1988) and kempe (1993) use second order markov models and train their systems on large handtagged corpora. using this metbod, they are able to tag more than 96 % of their test words with the correct part-of-speech. the need for reliably tagged training data, however, is a prob- lem for languages, where such data is not available in sufficient quantities. jelinek (1985) and cutting et al. (1992) circumvent his problem by training their taggers on untagged ata using tile itaum-welch algo- rithm (also know as the forward-backward algorithm). 
they report rates of correctly tagged words which are comparable to that presented by church (1988) and kempe (1993).part-of-speech tagging with neural networks hehnut schmid institute for computational linguistics, azenbergstr.12, 70174 stuttgart, germany, schmid@ims.uni-stuttgart.de topic area: large text corpora, part-of-speech tag- ging, neural networks 1 abstract text corpora which are tagged with part-of-speech in- formation are useful in many areas of linguistic re- search. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C94-1032.txt b/DATASET_PACSUM/dataset/inputs/C94-1032.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e0bb71eda08b28627e32e3223baad07420ae2b8 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C94-1032.txt @@ -0,0 +1 @@ +qhe proposed japanese morphological nalyzer achieved 95. l% recall and 94.6% precision for open text when it was trained and tested on the ati? 1 introduct ion in recent years, we have seen a fair number of l)al)ers re- porting accuracies ofmore than 95% for english part of speech tagging with statistical language modeling tech- niques [2-4, 10, 11]. on the other hand, there are few works on stochastic japanese morphological nalysis [9, 12, 14], and they dont seem to have convinced the japanese nlp community that the statistically-based teclmiques are superior to conventional rule-based tech- niques uch as [16, 17]. we show in this paper that we can buihl a stochastic japanese morphological nalyzer that offers approxi- mately 95% accuracy on a statistical language model- ing technique and an efficient two-pass n-best search strategy. we used tile simple tri-pos model as the tagging model for japanese. probability estimates were ob- tained after training on the ati{ l)ialogue database [5], whose word segmentation a d part of speech tag assignment were laboriously performed by hand. 
we propose a novel search strategy for getting the n best morphological nalysis hypotheses for the in- put sentence. it consists of the forward dynamic pro- gramming search and the backward a* search. the proposed algorithm amalgamates and extends three well-known algorithms in different fields: the minimum connective-cost method [7] for japanese morphologi- cal analysis, extended viterbi algorithm for charac- ter recognition [6], and "l~ee-trellis n-best search for speech recognition [15]. we also propose a novel method for handling un- known words uniformly within the statistical pproach. using character trigrams ms tim word model, it gener- ates the n-best word hypotheses that match the left- most substrings starting at a given position in the input senten ce. moreover, we propose a novel method for evaluat- ing the performance of morphological analyzers. un- like english, japanese does not place spaces between words. it is difficult, even for native japanese, to place word boundaries consistently because of the aggluti- native nature of the language. thus, there were no standard performance metrics.a stochastic japanese morphological analyzer using a forward-dp backward-a* n-best search algor i thm masa.aki nagata ntt network information systems l~,~bor~ttorics 1-2356 take, yokosuka-shi, kanagaw~t, 238-03 japan (tel) 4-81-468-59-2796 (fax) +81-468-59-3428 (e-mail) nagata@nttnly.ntt . 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C94-1042.txt b/DATASET_PACSUM/dataset/inputs/C94-1042.txt new file mode 100644 index 0000000000000000000000000000000000000000..486c0fa16d08819475cddb5d015917df18b8896b --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C94-1042.txt @@ -0,0 +1 @@ +in l)articnlar, the feature set is more detailed than those of the major commercial dictionaries, such ;us the oxford adwmced learners dictionary (oali)) [d] and the longnum dictionary of contemporary english (ldoce) [8], which haw~ i)een widely used as a source o[ lexical i,,for,,lal, ioil ill ];lll- guage analyzers. 1 in addil.ion, we have ahned to be irio,e corrlpreheiisive ill capturhig featt, res (hi partic.u- ]ar, stibcategorization [eatures) than co,ii,llercial dic tlonaries. 2 structure ti le word list was derived fion, the file prepared by prof. roger mitten from the oxford adwn,ced learners dictionary, and contains about 38,000 head forms, although some purely british terms have been omitted, loach entry is organized as a nested set of typed feature-vahle ists. we currently use a lisp-like parenthesized list notation, although the lexicon couhl ito facilii~ate the transit ion to comlex by currenl, users of these dictionaries, we have i)reparcd mappings froln comi,ex classes to those of several other dictionaries. be readily mapped into other hwn,s, such as sc, mi,- marked text, if desired. sollie sauil)le dicticl l ,ary entries are shown ilt f igure 1. the first syml/ol gives the part of speech; a word with several parts of speech will have several dictionary entries, one for each part of speech. each e,itry has all :orth foati lre, g iv ing the base fo,lfl of ti le word, no,ins, verbs, and adjectiw~s with irregular inorphology will liave featt,res for the irregular fo,.iris :plural, :past, :past- part, etc. words which take con-,i)leirients will have a subcatego,ization (:sube) [eat,ire. 
for exaniple> the verb "ai )andon" eali occur with a ilollri phrase followed by a prepositional phrase with tim preposition "to" (e.g., "1 abandoned hii,i to the linguists.") or with just a ,loll,, phrase comple i f ient ( " [ aballdone(l the shill."). other syntactic features are recorded under :features. for example, the noun "abandon" is marked as (count- able :pval ("wlth")), indicating that it must appear in the singular with a deter,niner unless it is preceded by the preposzion "with". 2.1 subcategor i za t ion we have paid p~uticular attention to providing detailed subcategorization information (information about complement s ructure), both for verbs and for tllose nouns and adjectives which do take cmnl)lements. in order to insure the coml)leteness of our codes, we studied the codiug e)ul)loyed by s(weral other u,ajor texicous, includh,g (,he ihandeis verh lexlcolt 2, the a(jqijii,ex prc, ject [10], the nyu linguistic string lroject [9], the oali), and ia)oci], a, nd, whenever feasime, haw~ sought to incorporate distinctions made in any of these all(tie,tortes.comlex syntax : bu i ld ing a computat iona l lex icon ra lph gr i shm:m, cather ine mac leod, and adam mcyers computer science depar tment , new york un ivers i ty 715 broadw,~y, 7th f loor , new york, ny 10003, u.s.a. {gr i s lnnan ,mac leod ,me.yers } (@cs.nyu.e(ht abstract we des((tile tile design of comlex syntax, a co,nputa- tional lexicon providing detailed syntactic iuformation ff)r approximately 38,000 english headwords. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C94-1079.txt b/DATASET_PACSUM/dataset/inputs/C94-1079.txt new file mode 100644 index 0000000000000000000000000000000000000000..02567dceb8626069de4142949b393a30d8b61757 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C94-1079.txt @@ -0,0 +1 @@ +principle-based grammars, such as govern- ment-binding (gb) theory (chomsky, 1981; haegeman, 1991), offer many advantages over rule-based and unification-based grammars, such as the universality ofprinciples and mod- ularity of components in the grammar. prin- ciples are constraints over x-bar structures. most previous principle-based parsers, e.g., (dorr, 1991; font, 1991; johnson, 1991), es- sentially generate all possible x-bar structures of a sentence and then use the principles to fil- ter out the illicit ones. the drawback of this approach is the inefficiency due 1;o the large number of candidate structures to be. filtered out. the problem persists even when w~rions techniques such as optimal ordering of princi- ples (fong, 1991), and corontining (dorr, 1991; johnson, 1991) are used. this problem may also account for the fact that these parsers are experimental nd have limited coverage. this paper describes an efficient, broad- coverage, principle-based parser, called prin- cipar. the main innovation in principar is that it applies principles to descriptions o17 x- bar structures rather than the structures them- selves. x-bar structures of a sentence are only built when their descriptions have satisfied all the pri ncil)les. o dynamic data \ [~ static dala l)rocegsing module data flow figure 1: '.pile architecture of principar figure i shows the architecture of prin- cipar. sentence analysis is divided into three steps. the lexical analyser first converts the in- put sentence into a set of texical items. then, a message passing algorithm for ob-parsing is used to construct a shared parse forest. 
fi- nally, a parse tree retriever is used to enumer- ate the parse trees. the key idea of the parsing algorithm was presented in (tin, 199:1). this paper presents some implementation details and experimental results.this paper presents some implementation details and experimental results. this re'- search was supported by naturm sciences and engineering research council of canada grant ogp121338. the links in the net- work re.present relationships bel;ween the cat- egories. gb-principles are implemented as lo- cal constraints attached to the nodes and 482 perco la t ion cormtra in ts attached to links in the network. the correct parses for all the sentences in tm)le 1 are returned by the parser. bonnie dorr for comments about sections 1, 2, and 3. f igure'2 depicts ~ port:ion c" tile gr ;unmar network for |dnglish. acknowledgements the author wishes to thanl? our experiments have shown that the parser is very fast. ' 2 \ i t " "ip cpspe~.. table 1 lists the parsing time and the number of parses for several ex- ample sentences. the nodes in tile grammar network represent grammati- cal categories (e.g., np, nbar, n) or subcate- gories, such as v:np (transitive verbs that take nps as complements). , - /~ / \~ i aai ~ i'p " ni i vi : 1 t . . ".,.... when attribute values are used in messages, the complexity of the mgorithm is not yet known. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C94-2174.txt b/DATASET_PACSUM/dataset/inputs/C94-2174.txt new file mode 100644 index 0000000000000000000000000000000000000000..96467d5c345db506e6b8690e6f298b222f70d8dc --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C94-2174.txt @@ -0,0 +1 @@ +text types thor(; are. different types of l;exl [exl.s "al)oui," l.he sa.me th ing m~ty be in differing geurcs, of difl(~rem. i y i)es, ;rod of v;trying quality. texts vary along st.ver;d param. el.ers, a.ll relcwull, for l,he gcuera.l inlortlu~tiol~ rel, ri(wal problem of real.thing rea(lcr needs m.i texts. 
(liven this variat ion, in a text retrieval eonl.ext, the l)rol)lems arc (i) i (mttifying ;cures, and (ii) choosing criteria t,o ch,s-- ter texts of the smnc gem:e, wit, h l)redictal>le l>recision aml rcca.ll. this should uot he eonfused with t, he issue of idenl.ifying topics, m,d choosiug criw+ria that. diserinl- inatc on(: topic from auother. all.hough u(>t orthogonal to gem(, del)endent; wu+iat, ion, the wuiat, ioll i, hat, rela, l,es dirc(-t.ly to (:onw.ui; and topic is moug or, her (litu<.usions. na.l,ura.lly, there is (;o-va.riancc.. iexl.s al)oul. (:(+rl.aitl topics ula,y only occur iu (:(;rt;ailt g(!tll(!s, alt(] {.exl.s ill eertaiu ge.nres may only t.rea.t c(ql.ain topics; mosl. l.ol)- ics do, however, occur iu several ;cures, which is what inl;erests us here. douglas i~il)et: has sl, udied l;exl, variat.ion along scv eral l )aranmtcrs, and found that t,cxt.s can i)(,, cousidcrcd to wvry along live ditnensious. in his st, udy, he clush.rs [~ai.ures according t.o eowuiauce, t.o find tmderlyiug di mens ions (198!)). we wish to liud a method for idenl.ifv- in ; easily eomput.al)h; i)[tl:al,|et.chs t.hat ra.l>idly classify previously illls(?]~{ecogni:zing ]:f:xt genii.es wl r l l s:lb,/l:ll,i,; ~/~i,;ii/i(~s using discii . \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C94-2178.txt b/DATASET_PACSUM/dataset/inputs/C94-2178.txt new file mode 100644 index 0000000000000000000000000000000000000000..df423a163caab168af40e82411ae4a6c6c879337 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C94-2178.txt @@ -0,0 +1 @@ +there have been quite a number of recent papers on parallel text: brown et al(1990, 1991, 1993), chen (1993), church (1993), church et al(1993), dagan et al(1993), gale and church (1991, 1993), isabelle (1992), kay and rgsenschein (1993), klavans and tzoukermann (1990), kupiec (1993), matsumoto (1991), ogden and gonzales (1993), shemtov (1993), simard et al(1992), warwick- armstrong and russell (1990), wu (to appear). 
most of this work has been focused on european language pairs, especially english-french. it remains an open question how well these methods might generalize to other language pairs, especially pairs such as english-japanese and english- chinese. in previous work (church et al 1993), we have reported some preliminary success in aligning the english and japanese versions of the awk manual (aho, kernighan, weinberger (1980)), using charalign (church, 1993), a method that looks for character sequences that are the same in both the source and target. the charalign method was designed for european language pairs, where cognates often share character sequences, e.g., government and gouvernement. in general, this approach doesn't work between languages uch as english and japanese which are written in different alphabets. the awk manual happens to contain a large number of examples and technical words that are the same in the english source and target japanese. it remains an open question how we might be able to align a broader class of texts, especially those that are written in different character sets and share relatively few character sequences. the k-vec method attempts to address this question.k-vec starts by estimating the lexicon. consider the example: fisheries --~ p~ches. the k-vec algorithm will discover this fact by noting that the distribution of fisheries in the english text is similar to the distribution of p~ches in the french. the k-vec method attempts to address this question. the k-vec algorithm generates a quick-and-dirty estimate of a bilingual exicon. this estimate could be used as a starting point for a more detailed alignment algorithm such as word_align (dagan et al, 1993). the concordances for fisheries and p~ches are shown in tables 1 and 2 (at the end of this paper). 2.3 royale languages 2.3 grief grievance 7. these tables were computed from a small fragment ofthe. 
in this way, we might be able to apply word_align to a broader class of language combinations including possibly english-japanese and english-chinese. 2.4 vanier vanier. canadian hansards that has been used in a number of other studies: church (1993) and simard et al(1992). currently, word_align depends on charalign (church, 1993) to generate a starting point, which limits its applicability to european languages since char_align was designed for language pairs that share a common alphabet. there have been quite a number of recent papers on parallel text: brown et al(1990, 1991, 1993), chen (1993), church (1993), church et al(1993), dagan et al(1993), gale and church (1991, 1993), isabelle (1992), kay and rgsenschein (1993), klavans and tzoukermann (1990), kupiec (1993), matsumoto (1991), ogden and gonzales (1993), shemtov (1993), simard et al(1992), warwick- armstrong and russell (1990), wu (to appear). \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C94-2195.txt b/DATASET_PACSUM/dataset/inputs/C94-2195.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc81b607050ca95a0af76f6a3167dcde1dc0cf37 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C94-2195.txt @@ -0,0 +1 @@ +prel)ositioual phrase attachment disambiguation is a difficult problem. take, for example, the sen- rouge: ( l ) buy a ear \[p,o with a steering wheel\]. we would guess that the correct interpretation is that one should buy cars that come with steer- ing wheels, and not that one should use a steering wheel as barter for purchasing a car. \]n this case, we are helped by our world knowledge about auto- mobiles and automobile parts, and about typical methods of barter, which we can draw upon to cor- rectly disambignate he sentence. 
beyond possibly needing such rich semantic or conceptual int'ornla- tion, a l tmann and steedman (as88) show that there a,re certain cases where a discourse model is needed to correctly disambiguate prepositional phrase atta.chment. however, while there are certainly cases of an> biguity that seem to need some deep knowledge, either linguistic or conceptual, one might ask whag sort of performance could 1oe achieved by a sys- tem thai uses somewhat superficial knowledge au- *parts of this work done a.t the computer and hp lbrmation science department, university of penn- sylvania were supported by by darpa and afosr jointly under grant no. afosr-90-0066, and by aro grant no. daal 03-89-c0031 pr\[ (first author) and by an ibm gradmtte fellowship (second author). this work was also supported at mit by arpa under con- tract n000t4-89-j-la32= monitored through the office of naval resear<:h (lirst a.uthor). tomatically ~xtracted from a large corpus. recent work has shown thai; this approach olds promise (h\]~,91, hr93). hi this paper we describe a new rule-based ap- proach to prepositional phrase attachment, disam- biguation. a set of silnple rules is learned au- tomatically to try to prediet proper attachment based on any of a number of possible contextual giles. baseline l l indle and rooth (iir91, 1\[17{93) describe corpus-based approach to disambiguating between prepositional phrase attachlnent to the main verb and to the object nonn phrase (such as in the ex- ample sentence above). they first point out that simple attachment s rategies snch as right associa- tion (kim73) and miuimal a.tbtchment (fra78) do not work well i,l practice' (see (wfb90)). they then suggest using lexical preference, estimated from a large corpus of text, as a method of re- solving attachment ambiguity, a technique the}' call "lexical association." 
from a large corpus of pursed text, they first find all nonn phrase heads, and then record the verb (if' any) that precedes the head, and the preposition (if any) that follows it, as well as some other syntactic inforlnation about the sentence. an algorithm is then specified 1,o try to extract attachment information h'om this table of co-occurrences. i!'or instance, a table entry is cousidered a definite instance of the prepositional phrase attaching to the noun if: '\['he noun phrase occm:s in a context where no verb could license the prepositional phrase, specifically if the noun phrase is in a subjeet or other pre-verbal position. they specify seven different procedures for decid- ing whether a table entry is au instance of no attachment, sure noun attach, sm:e verb attach, or all ambiguous attach. using these procedures, they are able to extract frequency information, 1198 counting t, he numl)e,r of times a ptu:ticular verb or ncmn a.ppe~u:s with a pal:tieuh~r l~reposition. these frequen(;ies erve a.s training d~t;a for the statistical model they use to predict correct i~ttachmenl to dismnbigu;~te s ntence (l), they would compute the likelihood of the preposition with giwm the verb buy, {rod eolltrast that with the likelihood of that preposition given i:he liottll whed. ()he, problem wit;h this ,~pproa~ch is tll~tt it is limited in what rel~tionships are examined to make mi ~d;tachment decision. simply extending t\[indle and l{,ooth's model to allow r)r relalion- ships such as tlml~ i)e.tweell the verb and the' ob- ject o\[' the preposition would i:esult ill too large a. parameter spa.ce, given ~my realistic quantity of traiuing data. another prol)lem of the method, shared by ma.ny statistical approaches, is that the. model ~(:quired (inring training is rel)reser~ted in a huge, t~d)le of probabilities, pl:ecludiug any stra.ightf'orward analysis of its workings. 
' l~-ansformat ion-based er ror -dr iven learn ing tra, ns\]bl'm~d;ion-lmsed errol:-dhven learlting is ~ sin@e learning a.lgorithm tlmt has t)eeu applied to a. number of natural la.ngm,ge prol)ie.ms, includ- jllg l)a.t't o\[' speech tagging and syuta.cl, ic l)m:sing (1h:i92, \]h:i93a, bri!)gb, bri9d). figure :1 illus- trates the learning l)l:occ'ss, l:irsl, tlll;21nlola, ted text; is l)assed through the initial-st;ate mmota- tot. 'l'lw~ initial-stat, e area)tater can range in com- plexity from quite trivial (e.g. assigning rmtdom strll(:ttll:c) to quit, e sophistica.ted (e.g. assigning the output of a. i{nowledge-based ;/l/llot;~l, tol' that was created by hand). ouce text has beeu passed through the iuitia.l-state almol, at.or, it. is then (;ore- pared to the h'ugh,, as indicated ill a luamlally an- nota,tea eorl)lls , and transformations are le~u'ned that can be applied to the oul, put of the iuitial state remora, tot t;o make it, better resemble the :ruffs. so far, ouly ~ greedy search al)proach as been used: at eaeh itera.tion o\[' learning, t.he tra nsfo> nl~tion is found whose application results in the greatest iml)rovenmnt; ha.t transfk)rmation is then added to the ordered trmlsforlmltiou list and the corpus is upd~d.ed by a.pplying the. learned trans formation. (see, (i{,mg,\[) for a detailed discussiou of this algorithm in the context of machiue, le, aru-- iug issues.) ottce 3,11 ordered list; of transform~tions i learned, new text, can be mmotated hy first ai> plying the initial state ~mnotator to it and then applying each o\[' the traam'ormations, iu order. unannotati{d \] "i'i~x'i' 1nh'\[ai, l state annotatlid text ti~.i j'\['l l , ~ e , n el( ~-~ rui ,i-s figure \[: transfonm~tion-i~ased error.-driven l,earlfiug. r lh:ansformation-b ased prepos i t iona l phrase at tachment we will now show how transformation-based e.rrol> driwm igmfing can be used to resolve prep(~si- tiered phrase at, tachment ambiguity. 
the l)reposi- tioiml phrase a.tt~munent |ea.riter learns tra.nsfor-- ill~ttiolls \[?onl a c,)l:l>tls o\[ 4-tuples of the \['orm (v i11 i\] 1|9), where v is ~1 w;rl), nl is the head of its objecl, llolni \]phrase, i ) is the \])l'epositioll, and 11:2 is the head of the noun phrase, governed by the prel)c, sition (for e,-:anq~le, sce/v :1~' bo:q/,l o,/p the h711/~2). 1,'or all sentences that conlbrm to this pattern in the penn treeb~mk w{dl st, l:eet 3ourlml corpns (msm93), such a 4-tuplc was formed, attd each :l-tuple was paired with the at~aehnteut de- cision used in the treebauk parse) '\['here were 12,766 4q;ul)les in all, which were randomly split into 12,206 trnining s**mples and 500 test samples. \[n this e?periment (as in (\[ii~,9\], i\]l{93)), tim at- tachment choice for l)repositional i)hrases was i)e- i,ween the oh.iecl~ mmn and l,he matrix verb. \[n the initial sl,~te mmotator, all prepositional phrases i \])at.terns were extra.clxxl usj.ng tgrep, a. tree-based grep program written by rich pito. '\]'\]te 4-tuples were cxtract;ed autom~tk:ally, a.ud mista.kes were not. m~vn tta.lly pruned out. 1199 are attached to the object, noun. 2 this is tile at- tachment predicted by right association (kim73). the allowable transforlnations are described by the following templates: ? change the attachment location from x to y if: - n l i sw - n2 is w - v isw -- p is w - n l is w1 and n2 is w2 - n l i swl andv isw2 here "from x to y" can be either "from nl to v" or "from v to nl ," w (w1, w2, etc.) can be any word, and the ellipsis indicates that the complete set of transformations permits matching on any combination of values for v, n l , p, and n2, with the exception of patterns that specify vahms for all four. for example, one allowable transformation would be change the attachment location from nl to v if p is "until". learning proceeds as follows. 
first, the train- ing set is processed according to the start state annotator, in this case attaching all prepositional phrases low (attached to nl) . then, in essence, each possible transtbrmation is scored by apply- ing it to the corpus and cornputing the reduction (or increase) in error rate. in reality, the search is data driven, and so the vast majority of al- lowable transformations are not examined. the best-scoring transformation then becomes the first transformation i the learned list. it is applied to the training corpus, and learning continues on the modified corpus. this process is iterated until no rule can he found that reduces the error rate. in the experiment, a tol, al of 471 transfor- mations were learned - - figure 3 shows the first twenty. 3 initial accuracy on the test set is 64.0% when prepositional phrases are always attached to the object noun. after applying the transforma- tions, accuracy increases to 80.8%. figure 2 shows a plot of test-set accuracy as a function of the nulnber of training instances. it is interesting to note that the accuracy curve has not yet, reached a 2if it is the case that attaching to the verb would be a better start state in some corpora, this decision could be parameterized. zin transformation #8, word token amount appears because it was used as the head noun for noun phrases representing percentage amounts, e.g. "5%." the rule captures the very regular appearance in the penn tree- bank wall street journal corpus of parses like sales for the yea," \[v'p rose \[np5yo\]\[pp in fiscal 1988\]\]. accuracy 81.00 rl 80.00 !! 79,00 t 77.00 !--r . . . / - -f . . . %oo!1 / i 74:001 . . . _ _ t .... _ _ 73.00 j - 72.00 l l i _ __ / __ . ,?!>2 - 70.00 69.00 68.00 67.00 64.00 0.00 5.00 i q ! i t t!aining size x 103 10.00 figure 2: accuracy as a function of l;raining corpus size (no word class information). plateau, suggesting that more training data wonld lead to further improvements. 
adding word class in format ion in the above experiment, all trans\[brmations are. triggered hy words or groups of words, and it is surprising that good performance is achieved even in spite of the inevitable sparse data problems. there are a number of ways to address the sparse data problem. one of the obvious ways, mapping words to part of speech, seerns unlikely to help. h> stead, semanl, ic class information is an attraclive alternative. we incorporated the idea of using semantic ino tbrmation in the lbllowing way. using the word~ net noun hierarchy (milg0), each noun in the ffa{ning and test corpus was associated with a set containing the noun itself ph.ts the name of every semantic lass that noun appears in (if any). 4 the transformation template is modified so that in ad- dition to asking if a nmm matches ome word w, 4class names corresponded to unique "synonynl set" identifiers within the wordnet noun database. a noun "appears in" a class if it falls within the hy- ponym (is-a) tree below that class. in the experiments reported here we used wordnet version :l.2. 1200 1 2 4 5 (3 7 8 9 10 ii 12 :13 \]4 15 \[6 17 \[8 119 2()_ change att{:~ehment location l"r~m~ to ( ;omit ion n1 v p is at n\ ] \ / p is as n1 v i ) is iulo n:i \/ p is ,l}'om n:i v p is with n\] v n2 is year n 1 v p is by i? is i~ and n i v ni ix amounl n \[ \/ \]' is lhrough ni v \]) is d'urb~g ni v v ix p,ul n1 v n2 is mou.lk n\[ v 1' is ulldcr nj v 1 ) is after v is have and n1 v i' is b~ n:\[ v p is wilk.oul v ni p is of v is buy and n1 \/ p is for n:i v p is bejbl"( v is have and ni v p is o~ x/ l( v v ~ v v / v v ,/ figure 3: the \[irst 20 transforntat;ions learned tbr preposil;ional phrase ~ttachme, n|;. it: (~an a/so ask if" it is a~ member of some class c. s this al)proaeh i;o data. sparseness i similar to tllat of (l{,es93b, li, l\[93), where {~ method ix proposed for using wordnet in conjunction with a corpus to ohtain class-based statisl, ie,q. 
()lit' method here is ltlllc\]l simpler, however, in i;hat we a.re only us- ing boolean values to indieal;e whel;her ~ word can be a member of' a class, rather than esl, imat ing ~ filll se{, of joint probabil it ies involving (:lasses. since the tr;ulsformation-based al)l/roach with classes ccm gener~dize ill a way that the approach without classes is ml~l)le to, we woldd expect f'cwer l;ransf'ormal;ions to be necessary, l!;xperimeah, ally, this is indeed the case. in a second experiment;, l;raining a.ml testing were era:tied out on the same samples as i , the previous experiment, bul; i;his t ime using the ext, ende, d tra ns lbrmat ion t(;ml)la.tes for word classes. a total of 266 transformations were learned. applying l.hese transt'ormai.ions to the test set l'eslllted in a.n accuracy of' 81.8%. \[n figure 4 we show tile lirst 20 tra.nsform{~l, ions lem'ned using ilollll classes. class descriptions arc surrounded by square bracl{ets. (; 'phe first; grans- ibrmation st~l.cs thai. if" n2 is a. nomt i, hal; describes time (i.e. ix a. member of wordnet class that in- cludes tim nouns "y(;ar," "month," "week," and others), thell the preltositiomd phrase should be al;tache(\[ t,() the w;rb, since, tim(; is \]nlml more l ikely io modify a yet'it (e.g. le,vc lh(: re(cling iu an hour) thajl a, lloun. this exlw, r iment also demonstrates how rely \[~?~l;ul:e-based lexicon or word classiflcat, ion scheme cau triviajly be incorljorated into the learner, by exlencling l;ransfot'nlal,iolls to allow thent to make l'efel'ealc(? |;o it wol:(\[ g i l t \ [ { l i ly o\[' its features. \],valuation against other algorithms in (lil~91, hr93), tra.inittg is done on a superset el' sentence types ttsed ill train- ing the transforlj~atiolfbased learner. the transformation-based learner is i, rained on sen- tences containing v, n\[ and p, whereas the algo- r i thm describe.d by l l indle and i~,ooth ca.n zdso use sentences (;ontailfing only v and p, (n' only nl and i1. 
\[11 their lmper, they tra.in on ow~r 200,000 sen- lettces with prel)ositions f'rotn the associated press (apt newswire, trod i;hey quote a.n accuracy of 78- 80% on ap test &~ta.. ~' for reasons of ~: u n- t ime c\[lk:icn(:y, transfonmll, ions tmddng re\['crence 1:o tile classes of both n l a,nd n2 were iloi; p(~l?lxiitl, tr(i. gi;or expository purposes, the u.iqm'. wordnet id('.ntilicrs luwe been replaced by words lh~ll describe the cont, cnt of the class. 1207 (~lml~.ge \] at tachment , / location / # li'rom t 'fo \[ condition 1 n1 v n2 is \[time\]prel)ositioual phrase attachment disambiguation is a difficult problem. 1207 (~lml~.ge \] at tachment , / location / # li'rom t 'fo \[ condition 1 n1 v n2 is \[time\] 4 n1 v p is into 5 n 1 v p is from 6 n1 v 1 ) is wilh 7 n1 v p is of p is in and ni is 8 n 1 v \[measure, quanlily, amou~l\] p is by all.el 9 n1 v n2 is \[abslraclion\] i 0 ni v p is lhro'ugh 1) is in and n i is 11 ni v \[group,group.in.g\]. wordnet id('.ntilicrs luwe been replaced by words lh~ll describe the cont, cnt of the class. ilow- ever, this l)eeomes less of a probh'.m as atmotated eorl}ora beeolne increasingly available, and sug- gests the comhinat ion o1:' supexvised and uusuper vised methods as a.u ilfl;eresth g ave\]me \['or \['urther rese;ire\] \[. take, for example, the sen- rouge: ( l ) buy a ear \[p,o with a steering wheel\]. \]n this case, we are helped by our world knowledge about auto- mobiles and automobile parts, and about typical methods of barter, which we can draw upon to cor- rectly disambignate he sentence. we would guess that the correct interpretation is that one should buy cars that come with steer- ing wheels, and not that one should use a steering wheel as barter for purchasing a car. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C96-1005.txt b/DATASET_PACSUM/dataset/inputs/C96-1005.txt new file mode 100644 index 0000000000000000000000000000000000000000..30fa3f36db9a256afbf8171c77ebf0d34c4eec65 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C96-1005.txt @@ -0,0 +1 @@ +pau gargallo 5, 08028 barcelona. g.rigau@lsi.upc.es abst ract . this paper presents a method for the resolution of lexical ambiguity of nouns and its automatic evaluation over the brown corpus. the method relies on the use oil the wide-coverage noun taxonomy of wordnet and the notion of conceptual distance among concepts, captured by a conceptual density formula developed for this purpose. this fully automatic method requires no hand coding of lexical entries, hand tagging of text nor any kind of training process. the results of the experiments have been automatically evaluated against semcor, the sense-tagged version of the brown corpus. 1 int roduct ion much of recent work in lexical ambiguity resolution offers the prospect hat a disambiguation system might be able to receive as input unrestricted text and tag each word with the most likely sense with fairly reasonable accuracy and efficiency. the most extended approach use the context of the word to be disambiguatcd together with inlormation about each of its word senses to solve this problem. interesting experiments have been performed in recent years using preexisting lexical knowledge resources: [cowie el al. 92], [wilks et al. 93] with ldoce, [yarowsky 92] with rogets international thesaurus, and [sussna 93], [voorhees 9311, [richardson etal. 94], [resnik 95] with wordnet. although each of these techniques looks promising for disambiguation, either they have been only applied to a small number of words, a few sentences or not in a public domain corpus. for this reason we have tried to disambiguate all the nouns from real *eneko agirre was supported by a grant from the basque goverment. 
part of this work is included in projects 141226-ta248/95 of the basque country university and pi95-054 of the basque government.word sense disambiguation using conceptual density eneko agirre* lengoaia eta sistema informatikoak saila. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C96-1021.txt b/DATASET_PACSUM/dataset/inputs/C96-1021.txt new file mode 100644 index 0000000000000000000000000000000000000000..42cbe4b7224a5c42b923245052749c738accd2b1 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C96-1021.txt @@ -0,0 +1 @@ +(l,appin and leass, 1994) describe an algorithm for pronominal anaphora resolution with high rate of cor- rect analyses. while one of the strong points of this algorithm is that it operates primarily on syntactic in- formation ahme, this also turns out to be a limiting factor for its wide use: current state-of-the-art of prac- tically applicable parsing technology still falls short of robust and reliable delivery of syntactic analysis of real texts to the level of detail and precision that the filters a nd constraints described by i ,appin and l ,eass assume. we are particularly interested in a class of text pro- cessing applications, capable of delivery of content analysis to a depth inw~lving non-trivial amount of discourse processing, including anaphora resolution. the operational context prohibits us from making any assumptions concerning domain, style, and genre of input; as a result, we have developed a text processing framework which builds its capabilities entirely on the basis of a considerably shallower linguistic analysis of the input stream, thus trading off depth of base level analysis for breadth of cown:age. 
in this paper, we present work on modifying the lmp- pin/leass algorithm in a way which enables it to work off a flat morpho-syntactic analysis of the sentences of a text, while retaining a degree of quality and accuracy in pronorainal anaphora resolution comparable to that reported in (lappin and l,eass, 1994). the modifica- tions discussed below make the algorithm available to a wide range of text processing frameworks, which, due to the lack of full syntactic parsing capability, nor- really would have been unable to use this high preci- sion anap hora resolution tool. the work is additionally important, we feel, as it shows that informatkm about the content and logical structure of a text, in princi-. pie a core requirement for higher level semantic and discourse processes, can be effectively approximated by the right mix of constituent analysis and inferences about functional relations.lappin and leass' algorithm for pronominal anaphora resolution is capable of high accuracy, but requires in- depth, full, syntactic parsing of text. (l,appin and leass, 1994) describe an algorithm for pronominal anaphora resolution with high rate of cor- rect analyses. the overall success of the algo- rithm is important, then, not only for the immediate utility of the particular modifications, but also because the strategy we have developed for circumventing the need for full syntactic analysis is applicable to other in- terpretation tasks which, like the problem of anaphora resolution, lie in the space of higher level semantic and discourse analysis. pie a core requirement for higher level semantic and discourse processes, can be effectively approximated by the right mix of constituent analysis and inferences about functional relations. the base level linguistic analysis for actaphora resolu- tion is the output of a part of speech tagger, augmented with syntactic function annotatkms for each input to. 
in the event of a tie, the candidate which most immediately precedes the anaphor is selected as the antededent (where prece- dence is determined by comparing offset values). the coref value of the pronoun is set to that of the an- tecedent, adding it to the the antecedent's coref class, and the salience of the class is recalculated accordingly. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C96-1055.txt b/DATASET_PACSUM/dataset/inputs/C96-1055.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5f39ea1be1c8180a1bacedcfb2a407dfe352401 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C96-1055.txt @@ -0,0 +1 @@ +finally, we show that we can provide effective acquisition techniques for novel word senses using a combi- nation of online sources. 1 in t roduct ion this paper addresses the issue of word-sense ambigu- ity in extraction from machine-readable resources for the construction of large-scale knowledge sources. we describe two experiments: one which ignored word- sense distinctions, resulting in 6.3% accuracy for seman- tic classification of verbs based on (levin, 1993); and one which exploited word-sense distinctions, resulting in 97.9% accuracy. these experiments were dual pur- pose: (l) to validate the central thesis of the work of (levin, 1993), i.e., that verb semantics and syntactic be- havior are predictably related; (2) to demonstrate hat a 15-fold improvement can be achieved in deriving se- mantic information from syntactic ues if we first divide the syntactic ues into distinct groupings that correlate with different word senses. finally, we show that we can provide effective acquisition techniques for novel word senses using a combination of online sources, in particular, longmans dictionary of contemporary en- glish (ldoce) (procter, 1978), levins verb classifica- tion scheme (levin, 1993), and wordnet (miller, 1985). 
we have used these techniques to build a database of 10,000 english verb entries containing semantic infor- mation that we are currently porting into languages such as arabic, spanish, and korean for multilingual nlp tasks such as foreign language tutoring and ma- chine translation. 322 2 automat ic lex ica l acqu is i t ion fo r nlp tasks as machine-readable resources (i.e., online dictionaries, thesauri, and other knowledge sources) become read- ily available to nlp researchers, automated acquisition has become increasingly more attractive. several re- searchers have noted that the average time needed to construct a lexical entry can be as much as 30 min- utes (see, e.g., (neff and mccord, 1990; copestakc et al., 1995; walker and amsler, 1986)). given that we are aiming for large-scale lexicons of 20-60,000 words, automation of the acquisition process has become a ne- cessity. previous research in automatic acquisition focuscs primarily on the use of statistical techniques, such as bilingual alignment (church and hanks, 1990; kla- vans and tzoukermann, 1996; wu and xia, 1995), or extraction of syntactic constructions from online dic- tionaries and corpora (brant, 1993; dorr, garman, and weinberg, 1995). others who have taken a more knowledge-based (interlingual) approach (lonsdale, mi- tamura, and nyberg, 1996) do not provide a means for systematically deriving the relation between sur- face syntactic structures and their underlying semantic representations. those who have taken more argument structures into account, e.g., (copestake t al., 1995), do not take full advantage of the systematic relation be- tween syntax and semantics during lexical acquisition. we adopt the central thesis of levin (1993), i.e., that the semantic class of a verb and its syntactic behav- ior are predictably related. we base our work on a correlation between semantic classes and patterns of grammar codes in the longmans dictionary of con- temporary english (ldoce) (procter, 1978). 
while the ldoce has been used previously in automatic x- traction tasks (alshawi, 1989; farwell, guthrie, and wilks, 1993; boguraev and briscoe, 1989; ,wilks et al., 1989; wilks et al., 1990) these tasks are primarily con- cerned with the extraction of other types of informa- tion including syntactic phrase structure and broad ar- gument restrictions or with the derivation of semantic structures from definition analyses.role of word sense disambiguation i lexical acquisition: predicting semantics from syntactic cues bonn ie j. dor r and doug jones depar tment of computer sc ience and ins t i tu te for advanced computer stud ies un ivers i ty of mary land a.v. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C96-1058.txt b/DATASET_PACSUM/dataset/inputs/C96-1058.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e48d95ec270d545af7e61d53b1b6b2b6fc23c26 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C96-1058.txt @@ -0,0 +1 @@ +in these results, the generative model performs significantly better than the others, and does about equally well at assigning pa.rt- of-speech tags. 1 in t roduct ion in recent years, the statistical parsing community has begun to reach out; for syntactic formalisms that recognize the individuality of words, l,ink grammars (sleator and pemperley, 1991) and lex- icalized tree-adjoining ranunars (schabes, 1992) have now received stochastic treatments. other researchers, not wishing to abandon context-flee grammar (ci"g) but disillusioned with its lexica] blind spot, have tried to re-parameterize stochas- tic ci"g in context-sensitive ways (black et al., 1992) or have augmented the formalism with lex- ical headwords (magerman, 1995; collins, 11996). in this paper, we 1)resent a [lexible l)robat)ilistic parser that simultaneously assigns both part-of- sl)eech tags and a bare-bones dependency struc- ture (illustrate.d in l!igure 1). 
the choice o t a simple syntactic structure is deliberate: we would like to ask some basic questions about where hx- ical relationships al)pear and how best, to exploit *this materia.l is based upon work supported un- der a national science i%undation graduate fellow- ship, and has benefited greatly from discussions with mike collins, dan m(:lame(l, mitch marcus and ad- wait ratnaparkhi. (a) tile man in the coiner taught his dachsht , ld io play gol f i;os dt nn in dt nn vbd pp.p$ nn to vh nn /? man n ~.. phty~ j j - y , .% (b) the ill __ ~ / .~dachshund it) gol f . ) f cofllel his file figure 1: (a) a bare-l>ones dependen(-y parse. ]]a<:h word points to a single t)arent, the word it modities; the head of the sentence points to the eos (end-of: sentence) ma.rk. crossing links and cycles arc not al- lowed. (b) constituent structure and sub(:ategoriza- tion may be highlighted by displaying the same de- pendencies as a lexical tree. it is uscflfl to look into thes0 basic ques- tions before trying to tine-tmm the performance of systems whose behavior is harder to understand. 1 the main contribution of the work is to i)ro- pose three distin(t, lexiealist hyl)otheses abou(. (,he probability space underlying sehl]ence structure. we il]ustrate how each hypothesis is (:xl)ressed in a depemteney framework, and how each can be used to guide our parser toward its favored so- lution.three new probabi l is t ic mode ls for dependency parsing: an exploration* j ason m. e i sner cis depar tment , un ivers i ty of pe lmsy lva i f ia . 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C96-1079.txt b/DATASET_PACSUM/dataset/inputs/C96-1079.txt new file mode 100644 index 0000000000000000000000000000000000000000..356d7ebcb735b1e07e8911864f2fafb8f5dc83ad --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C96-1079.txt @@ -0,0 +1 @@ +1 the muc evaluations we have just completed the sixth in a series of message understanding conferences, which have been organized by nrad, the rdt&e division of the naval command, control and ocean surveil- lance center (formerly nosc, the naval ocean systems center) with the support of darpa, the defense advanced research projects agency. this paper looks briefly at the history of these conferences and then examines the considerations which led to the structure of muc-6} the message understanding conferences were initiated by nosc to assess and to foster research on the automated analysis of military messages containing textual information. although called "conferences", the distinguishing characteristic of the mucs are not the conferences themselves, but the evaluations to which participants must submit in order to be permitted to attend the conference. for each muc, participating roups have been given sample messages and instructions on the type of information to be extracted, and have developed a system to process uch messages. then, shortly before the conference, participants are given a set of test messages to be run through their system (without making any changes to the system); the output of each participants system 1the full proceedings of the conference are to be distributed by morgan kaufmann publishers, san ma- teo, california; earlier muc proeeedings~ for muc-3, 4, and 5, are also available from morgan kaufmann. beth sundheim naval command, control and ocean surveillance center research, development, test and evaluation division (nrad) code 44208 53140 gatchell road san diego, cmifornia 92152-7420 sundhe im@poj ke . 
mi l is then evaluated against a manually-prepared an- swer key. the mucs are remarkable in part because of the degree to which these evaluations have defined a prograin of research and development. darpa has a number of information science and technol- ogy programs which are driven in large part, by regular evaluations. the mucs are notable, how- ever, in that they in large part have shaped the research program in information extraction and brought it to its current state} 2 early history muc-1 (1987) was basically exploratory; each group designed its own format for recording the information in the document, and there was no formal evaluation. by muc-2 (1989), the task had crystalized as one of template filling. one re- ceives a description of a class of events to be iden- tiffed in the text; for each of these events one must fill a template with information about the event. the template has slots for information about the event, such as the type of event, the agent, the time and place, the effect, etc. for muc-2, the template had 10 slots. both muc-1 and muc- 2 involved sanitized forms of military messages about naval sightings and engagements.message unders tand ing conference - 6: a br ie f h is tory ralph grishman dept. \ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C96-2141.txt b/DATASET_PACSUM/dataset/inputs/C96-2141.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0ad0358d3c49579bd42fd4f9bb123cef5d2f2ea --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C96-2141.txt @@ -0,0 +1 @@ +in this paper, we address the problem of word alignments for a bilingual corpus. in the recent years, there have been a number of papers con- sidering this or similar problems: (brown et al, 1990), (dagan et al, 1993), (kay et al, 1993), (fung et al, 1993). in our approach, we use a first-order hidden markov model (hmm) (aelinek, 1976), which is similar, but not identical to those used in speech recognition. 
the key component of this approach is to make the alignment probabilities dependent not on the absolute position of the word align- ment, but on its relative position; i.e. we consider the differences in the index of the word positions rather than the index itself. the organization of the paper is as follows. after reviewing the statistical approach to ma- chine translation, we first describe the convention- al model (mixture model). we then present our first-order hmm approach in lull detail. finally we present some experimental results and compare our model with the conventional model.a key issne in modeling the string translation probability pr(j'~le i) is the question of how we define the correspondence b tween the words of the english sentence and the words of the french sentence. finally we present some experimental results and compare our model with the conventional model. in this paper, we address the problem of word alignments for a bilingual corpus. the ultimate test of the different alignment and translation models can only be car- ried out in the framework of a fully operational translation system. models describ- ing these types of dependencies are referred to as alignment models. in typical cases, we can assume a sort of pairwise dependence by considering all word pairs (fj, ei) for a given sentence pair i.-/1\[~'j', elqlj' we fur- ther constrain this model by assigning each french word to exactly one english word. we are presently studying and testing a nmltilevel hmm model that allows only a small number of large jumps. this research was partly supported by the (\]er- man federal ministery of education, science, t{e- search and technology under the contract num- ber 01 iv 601 a (verbmobil) and under the esprit research project 20268 'eutrans). this could be especially helpful for languages uch as german, where compound words are matched to several words in the source language. 
\ No newline at end of file diff --git a/DATASET_PACSUM/dataset/inputs/C96-2183.txt b/DATASET_PACSUM/dataset/inputs/C96-2183.txt new file mode 100644 index 0000000000000000000000000000000000000000..b117e8f7d19944da392a8c9ebc26587205286691 --- /dev/null +++ b/DATASET_PACSUM/dataset/inputs/C96-2183.txt @@ -0,0 +1 @@ +structure of the sentence, to identify the <:omponents o be separated out. obviously a parser couhl be used to obtain the complete structure of the sentence. ][owever, hill parsing is slow a+nd i)rone to fa.ilure, especially on <:omph!x sentences. in this l)aper, we consider two alternatives to fu]l parsing which could be use