alwaysaditi committed
Commit 99f5521 · Parent(s): 7d1289c
End of training

Note: this view is limited to 50 files because the commit contains too many changes.
- .config/.last_opt_in_prompt.yaml +1 -0
- .config/.last_survey_prompt.yaml +1 -0
- .config/.last_update_check.json +1 -0
- .config/active_config +1 -0
- .config/config_sentinel +0 -0
- .config/configurations/config_default +6 -0
- .config/default_configs.db +0 -0
- .config/gce +1 -0
- .config/logs/2024.06.18/13.22.38.097292.log +534 -0
- .config/logs/2024.06.18/13.23.02.197770.log +5 -0
- .config/logs/2024.06.18/13.23.12.081812.log +169 -0
- .config/logs/2024.06.18/13.23.20.359666.log +5 -0
- .config/logs/2024.06.18/13.23.30.494468.log +8 -0
- .config/logs/2024.06.18/13.23.31.099704.log +8 -0
- .gitattributes +2 -0
- README.md +107 -0
- config.json +63 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-1031.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-1043.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2004.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2009.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2018.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2019.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2024.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2026.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2030.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2031.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2034.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A88-1019.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A92-1006.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A92-1018.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A92-1021.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A94-1006.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A94-1009.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A94-1016.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1004.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1011.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1014.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1029.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1030.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1039.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1052.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-1007.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-1044.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-1072.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-2136.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-2137.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-2163.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C02-1011.txt +1 -0
- drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C02-1054.txt +1 -0
.config/.last_opt_in_prompt.yaml
ADDED
@@ -0,0 +1 @@
+{}
.config/.last_survey_prompt.yaml
ADDED
@@ -0,0 +1 @@
+last_prompt_time: 1718716991.5380163
.config/.last_update_check.json
ADDED
@@ -0,0 +1 @@
+{"last_update_check_time": 1718716999.9053707, "last_update_check_revision": 20240607152945, "notifications": [], "last_nag_times": {}}
.config/active_config
ADDED
@@ -0,0 +1 @@
+default
.config/config_sentinel
ADDED
File without changes
.config/configurations/config_default
ADDED
@@ -0,0 +1,6 @@
+[component_manager]
+disable_update_check = true
+
+[compute]
+gce_metadata_read_timeout_sec = 0
+
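The properties file above is plain INI, so it can be read back with Python's standard configparser; a minimal sketch, assuming the file sits at the path this commit adds it under:

```python
# Hedged sketch: inspecting the gcloud properties file added by this commit.
import configparser

cp = configparser.ConfigParser()
cp.read(".config/configurations/config_default")
print(cp["component_manager"]["disable_update_check"])  # "true"
print(cp["compute"]["gce_metadata_read_timeout_sec"])   # "0"
```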
.config/default_configs.db
ADDED
Binary file (12.3 kB).
.config/gce
ADDED
@@ -0,0 +1 @@
+False
.config/logs/2024.06.18/13.22.38.097292.log
ADDED
@@ -0,0 +1,534 @@
+2024-06-18 13:22:50,123 DEBUG root Loaded Command Group: ['gcloud', 'components']
+2024-06-18 13:22:50,127 DEBUG root Loaded Command Group: ['gcloud', 'components', 'update']
+2024-06-18 13:22:50,129 DEBUG root Running [gcloud.components.update] with arguments: [--allow-no-backup: "True", --compile-python: "True", --quiet: "True", COMPONENT-IDS:6: "['core', 'gcloud-deps', 'bq', 'gcloud', 'gcloud-crc32c', 'gsutil']"]
+2024-06-18 13:22:50,130 INFO ___FILE_ONLY___ Beginning update. This process may take several minutes.
+2024-06-18 13:22:50,152 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
+2024-06-18 13:22:50,289 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components-2.json HTTP/1.1" 200 222658
+Your current Google Cloud CLI version is: 480.0.0
+2024-06-18 13:22:50,307 INFO ___FILE_ONLY___ Installing components from version: 480.0.0
+2024-06-18 13:22:50,307 DEBUG root Chosen display Format:table[box,title="These components will be removed."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right)
+2024-06-18 13:22:50,308 DEBUG root Chosen display Format:table[box,title="These components will be updated."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right)
+2024-06-18 13:22:50,309 DEBUG root Chosen display Format:table[box,title="These components will be installed."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right)
+[Box table "These components will be installed.", emitted cell by cell across INFO lines, reconstructed:]
+┌─────────────────────────────────────────────────────┬────────────┬──────────┐
+│ Name                                                │    Version │     Size │
+├─────────────────────────────────────────────────────┼────────────┼──────────┤
+│ BigQuery Command Line Tool                          │      2.1.5 │  1.7 MiB │
+│ BigQuery Command Line Tool (Platform Specific)      │    2.0.101 │  < 1 MiB │
+│ Bundled Python 3.11                                 │     3.11.8 │ 75.1 MiB │
+│ Cloud Storage Command Line Tool                     │       5.29 │ 11.3 MiB │
+│ Cloud Storage Command Line Tool (Platform Specific) │       5.27 │  < 1 MiB │
+│ Google Cloud CLI Core Libraries (Platform Specific) │ 2024.01.06 │  < 1 MiB │
+│ Google Cloud CRC32C Hash Tool                       │      1.0.0 │  1.2 MiB │
+│ gcloud cli dependencies                             │ 2021.04.16 │  < 1 MiB │
+└─────────────────────────────────────────────────────┴────────────┴──────────┘
+2024-06-18 13:22:50,587 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/RELEASE_NOTES HTTP/1.1" 200 1228039
+2024-06-18 13:22:50,621 INFO ___FILE_ONLY___ For the latest full release notes, please visit:
+  https://cloud.google.com/sdk/release_notes
+2024-06-18 13:22:50,623 INFO ___FILE_ONLY___ ╠═ Creating update staging area ═╣
+[progress-bar "═" tick lines, 13:22:50,624 through 13:22:53,367, elided]
+2024-06-18 13:22:53,434 INFO ___FILE_ONLY___ ╠═ Installing: BigQuery Command Line Tool ═╣
+2024-06-18 13:22:53,511 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-bq-20240524155722.tar.gz HTTP/1.1" 200 1789662
+[progress-bar "═" tick lines elided]
+2024-06-18 13:22:53,798 INFO ___FILE_ONLY___ ╠═ Installing: BigQuery Command Line Tool (Platform Spec... ═╣
+2024-06-18 13:22:53,934 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-bq-nix-20240106004423.tar.gz HTTP/1.1" 200 2026
+2024-06-18 13:22:53,944 INFO ___FILE_ONLY___ ╠═ Installing: Bundled Python 3.11 ═╣
+2024-06-18 13:22:53,951 INFO ___FILE_ONLY___ ╠═ Installing: Bundled Python 3.11 ═╣
+2024-06-18 13:22:54,092 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-bundled-python3-unix-linux-x86_64-20240510142152.tar.gz HTTP/1.1" 200 78697278
+[progress-bar "═" tick lines, 13:22:54,359 through 13:22:59,309, elided]
+2024-06-18 13:22:59,390 INFO ___FILE_ONLY___ ╠═ Installing: Cloud Storage Command Line Tool ═╣
+2024-06-18 13:22:59,536 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gsutil-20240510142152.tar.gz HTTP/1.1" 200 11893574
+[progress-bar "═" tick lines, 13:22:59,574 through 13:23:01,060, elided]
+2024-06-18 13:23:01,114 INFO ___FILE_ONLY___ ╠═ Installing: Cloud Storage Command Line Tool (Platform... ═╣
+2024-06-18 13:23:01,251 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gsutil-nix-20240106004423.tar.gz HTTP/1.1" 200 2042
+2024-06-18 13:23:01,262 INFO ___FILE_ONLY___ ╠═ Installing: Default set of gcloud commands ═╣
+2024-06-18 13:23:01,269 INFO ___FILE_ONLY___ ╠═ Installing: Google Cloud CLI Core Libraries (Platform... ═╣
+2024-06-18 13:23:01,408 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-core-nix-20240106004423.tar.gz HTTP/1.1" 200 2410
+2024-06-18 13:23:01,419 INFO ___FILE_ONLY___ ╠═ Installing: Google Cloud CRC32C Hash Tool ═╣
+2024-06-18 13:23:01,557 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gcloud-crc32c-linux-x86_64-20231215195722.tar.gz HTTP/1.1" 200 1287877
+[progress-bar "═" tick lines elided]
+2024-06-18 13:23:01,612 INFO ___FILE_ONLY___ ╠═ Installing: Google Cloud CRC32C Hash Tool ═╣
+2024-06-18 13:23:01,619 INFO ___FILE_ONLY___ ╠═ Installing: gcloud cli dependencies ═╣
+2024-06-18 13:23:01,754 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gcloud-deps-linux-x86_64-20210416153011.tar.gz HTTP/1.1" 200 104
+2024-06-18 13:23:01,763 INFO ___FILE_ONLY___ ╠═ Creating backup and activating new installation ═╣
+2024-06-18 13:23:01,763 DEBUG root Attempting to move directory [/tools/google-cloud-sdk] to [/tools/google-cloud-sdk.staging/.install/.backup]
+2024-06-18 13:23:01,763 DEBUG root Attempting to move directory [/tools/google-cloud-sdk.staging] to [/tools/google-cloud-sdk]
+2024-06-18 13:23:01,767 DEBUG root Updating notification cache...
+2024-06-18 13:23:01,769 INFO ___FILE_ONLY___ Performing post processing steps...
+2024-06-18 13:23:01,769 DEBUG root Executing command: ['/tools/google-cloud-sdk/bin/gcloud', 'components', 'post-process']
+2024-06-18 13:23:11,533 INFO ___FILE_ONLY___ Update done!
+2024-06-18 13:23:11,536 DEBUG root Chosen display Format:none
+2024-06-18 13:23:11,537 INFO root Display format: "none"
.config/logs/2024.06.18/13.23.02.197770.log
ADDED
@@ -0,0 +1,5 @@
+2024-06-18 13:23:02,198 DEBUG root Loaded Command Group: ['gcloud', 'components']
+2024-06-18 13:23:02,201 DEBUG root Loaded Command Group: ['gcloud', 'components', 'post_process']
+2024-06-18 13:23:02,203 DEBUG root Running [gcloud.components.post-process] with arguments: []
+2024-06-18 13:23:11,427 DEBUG root Chosen display Format:none
+2024-06-18 13:23:11,428 INFO root Display format: "none"
.config/logs/2024.06.18/13.23.12.081812.log
ADDED
@@ -0,0 +1,169 @@
+2024-06-18 13:23:12,082 DEBUG root Loaded Command Group: ['gcloud', 'components']
+2024-06-18 13:23:12,085 DEBUG root Loaded Command Group: ['gcloud', 'components', 'update']
+2024-06-18 13:23:12,087 DEBUG root Running [gcloud.components.update] with arguments: [--quiet: "True", COMPONENT-IDS:8: "['gcloud', 'core', 'bq', 'gsutil', 'compute', 'preview', 'alpha', 'beta']"]
+2024-06-18 13:23:12,088 INFO ___FILE_ONLY___ Beginning update. This process may take several minutes.
+2024-06-18 13:23:12,096 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
+2024-06-18 13:23:12,229 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components-2.json HTTP/1.1" 200 222658
+2024-06-18 13:23:12,247 WARNING root Component [compute] no longer exists.
+2024-06-18 13:23:12,248 WARNING root Component [preview] no longer exists.
+Your current Google Cloud CLI version is: 480.0.0
+2024-06-18 13:23:12,249 INFO ___FILE_ONLY___ Installing components from version: 480.0.0
+[display-format DEBUG lines, identical in form to the previous log, elided]
+[Box table "These components will be installed.", emitted cell by cell across INFO lines, reconstructed:]
+┌───────────────────────┬────────────┬─────────┐
+│ Name                  │    Version │    Size │
+├───────────────────────┼────────────┼─────────┤
+│ gcloud Alpha Commands │ 2024.06.07 │ < 1 MiB │
+│ gcloud Beta Commands  │ 2024.06.07 │ < 1 MiB │
+└───────────────────────┴────────────┴─────────┘
+2024-06-18 13:23:12,431 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/RELEASE_NOTES HTTP/1.1" 200 1228039
+2024-06-18 13:23:12,468 INFO ___FILE_ONLY___ For the latest full release notes, please visit:
+  https://cloud.google.com/sdk/release_notes
+2024-06-18 13:23:12,470 INFO ___FILE_ONLY___ ╠═ Creating update staging area ═╣
+[progress-bar "═" tick lines, 13:23:12,470 through 13:23:16,253, elided]
+2024-06-18 13:23:19,604 INFO ___FILE_ONLY___ ╠═ Installing: gcloud Alpha Commands ═╣
+2024-06-18 13:23:19,746 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-alpha-20240607152945.tar.gz HTTP/1.1" 200 800
+2024-06-18 13:23:19,757 INFO ___FILE_ONLY___ ╠═ Installing: gcloud Beta Commands ═╣
+2024-06-18 13:23:19,891 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-beta-20240607152945.tar.gz HTTP/1.1" 200 797
+2024-06-18 13:23:19,901 INFO ___FILE_ONLY___ ╠═ Creating backup and activating new installation ═╣
+2024-06-18 13:23:19,901 DEBUG root Attempting to move directory [/tools/google-cloud-sdk] to [/tools/google-cloud-sdk.staging/.install/.backup]
+2024-06-18 13:23:19,901 DEBUG root Attempting to move directory [/tools/google-cloud-sdk.staging] to [/tools/google-cloud-sdk]
+2024-06-18 13:23:19,905 DEBUG root Updating notification cache...
+2024-06-18 13:23:19,907 INFO ___FILE_ONLY___ Performing post processing steps...
+2024-06-18 13:23:19,908 DEBUG root Executing command: ['/tools/google-cloud-sdk/bin/gcloud', 'components', 'post-process']
+2024-06-18 13:23:29,922 INFO ___FILE_ONLY___ Update done!
+2024-06-18 13:23:29,925 DEBUG root Chosen display Format:none
+2024-06-18 13:23:29,925 INFO root Display format: "none"
.config/logs/2024.06.18/13.23.20.359666.log
ADDED
@@ -0,0 +1,5 @@
+2024-06-18 13:23:20,360 DEBUG root Loaded Command Group: ['gcloud', 'components']
+2024-06-18 13:23:20,362 DEBUG root Loaded Command Group: ['gcloud', 'components', 'post_process']
+2024-06-18 13:23:20,364 DEBUG root Running [gcloud.components.post-process] with arguments: []
+2024-06-18 13:23:29,624 DEBUG root Chosen display Format:none
+2024-06-18 13:23:29,625 INFO root Display format: "none"
.config/logs/2024.06.18/13.23.30.494468.log
ADDED
@@ -0,0 +1,8 @@
+2024-06-18 13:23:30,496 DEBUG root Loaded Command Group: ['gcloud', 'config']
+2024-06-18 13:23:30,546 DEBUG root Loaded Command Group: ['gcloud', 'config', 'set']
+2024-06-18 13:23:30,548 DEBUG root Running [gcloud.config.set] with arguments: [SECTION/PROPERTY: "component_manager/disable_update_check", VALUE: "true"]
+2024-06-18 13:23:30,549 INFO ___FILE_ONLY___ Updated property [component_manager/disable_update_check].
+
+2024-06-18 13:23:30,550 DEBUG root Chosen display Format:default
+2024-06-18 13:23:30,551 INFO root Display format: "default"
+2024-06-18 13:23:30,551 DEBUG root SDK update checks are disabled.
.config/logs/2024.06.18/13.23.31.099704.log
ADDED
@@ -0,0 +1,8 @@
+2024-06-18 13:23:31,101 DEBUG root Loaded Command Group: ['gcloud', 'config']
+2024-06-18 13:23:31,154 DEBUG root Loaded Command Group: ['gcloud', 'config', 'set']
+2024-06-18 13:23:31,157 DEBUG root Running [gcloud.config.set] with arguments: [SECTION/PROPERTY: "compute/gce_metadata_read_timeout_sec", VALUE: "0"]
+2024-06-18 13:23:31,158 INFO ___FILE_ONLY___ Updated property [compute/gce_metadata_read_timeout_sec].
+
+2024-06-18 13:23:31,159 DEBUG root Chosen display Format:default
+2024-06-18 13:23:31,160 INFO root Display format: "default"
+2024-06-18 13:23:31,161 DEBUG root SDK update checks are disabled.
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+sample_data/mnist_test.csv filter=lfs diff=lfs merge=lfs -text
+sample_data/mnist_train_small.csv filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+---
+base_model: google/pegasus-x-base
+tags:
+- generated_from_trainer
+model-index:
+- name: google/pegasus-x-base
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# google/pegasus-x-base
+
+This model is a fine-tuned version of [google/pegasus-x-base](https://huggingface.co/google/pegasus-x-base) on an unknown dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.0135
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 8
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- num_epochs: 5
+- mixed_precision_training: Native AMP
+
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 8.9092 | 0.1008 | 10 | 8.5348 |
+| 7.9162 | 0.2015 | 20 | 7.5592 |
+| 7.3907 | 0.3023 | 30 | 6.9080 |
+| 6.8587 | 0.4030 | 40 | 6.1464 |
+| 5.7817 | 0.5038 | 50 | 5.2883 |
+| 5.0792 | 0.6045 | 60 | 3.9477 |
+| 4.1259 | 0.7053 | 70 | 2.7538 |
+| 3.0821 | 0.8060 | 80 | 1.7983 |
+| 2.2714 | 0.9068 | 90 | 1.4814 |
+| 1.7994 | 1.0076 | 100 | 1.4092 |
+| 1.4936 | 1.1083 | 110 | 1.3189 |
+| 1.6535 | 1.2091 | 120 | 1.2445 |
+| 1.3122 | 1.3098 | 130 | 1.2139 |
+| 1.0667 | 1.4106 | 140 | 1.1800 |
+| 1.274 | 1.5113 | 150 | 1.1507 |
+| 1.1739 | 1.6121 | 160 | 1.1279 |
+| 1.1871 | 1.7128 | 170 | 1.1094 |
+| 1.2037 | 1.8136 | 180 | 1.0973 |
+| 1.0839 | 1.9144 | 190 | 1.0832 |
+| 1.0738 | 2.0151 | 200 | 1.0752 |
+| 1.0955 | 2.1159 | 210 | 1.0695 |
+| 1.1285 | 2.2166 | 220 | 1.0629 |
+| 0.9973 | 2.3174 | 230 | 1.0574 |
+| 1.0522 | 2.4181 | 240 | 1.0557 |
+| 1.0803 | 2.5189 | 250 | 1.0458 |
+| 1.0707 | 2.6196 | 260 | 1.0425 |
+| 1.1868 | 2.7204 | 270 | 1.0384 |
+| 1.0117 | 2.8212 | 280 | 1.0374 |
+| 0.9206 | 2.9219 | 290 | 1.0347 |
+| 1.0099 | 3.0227 | 300 | 1.0306 |
+| 1.0459 | 3.1234 | 310 | 1.0307 |
+| 1.0721 | 3.2242 | 320 | 1.0313 |
+| 1.015 | 3.3249 | 330 | 1.0278 |
+| 1.0358 | 3.4257 | 340 | 1.0237 |
+| 0.9608 | 3.5264 | 350 | 1.0206 |
+| 1.0416 | 3.6272 | 360 | 1.0202 |
+| 0.9304 | 3.7280 | 370 | 1.0201 |
+| 1.0447 | 3.8287 | 380 | 1.0187 |
+| 1.0007 | 3.9295 | 390 | 1.0180 |
+| 1.1681 | 4.0302 | 400 | 1.0168 |
+| 1.0258 | 4.1310 | 410 | 1.0163 |
+| 1.1054 | 4.2317 | 420 | 1.0153 |
+| 0.907 | 4.3325 | 430 | 1.0154 |
+| 0.935 | 4.4332 | 440 | 1.0151 |
+| 0.9904 | 4.5340 | 450 | 1.0145 |
+| 0.9735 | 4.6348 | 460 | 1.0142 |
+| 0.9633 | 4.7355 | 470 | 1.0138 |
+| 1.2809 | 4.8363 | 480 | 1.0136 |
+| 1.0361 | 4.9370 | 490 | 1.0135 |
+
+
+### Framework versions
+
+- Transformers 4.41.2
+- Pytorch 2.3.0+cu121
+- Datasets 2.20.0
+- Tokenizers 0.19.1
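The hyperparameters in the model card map one-to-one onto `transformers` training arguments. Below is a minimal sketch of how this run could be reproduced; the `output_dir` value and the dataset preparation are assumptions, not recorded in this commit.

```python
# Sketch only: mirrors the hyperparameters listed in README.md.
# output_dir is a placeholder; train/eval datasets are assumed prepared elsewhere.
from transformers import (
    AutoTokenizer,
    PegasusXForConditionalGeneration,
    Seq2SeqTrainingArguments,
)

model_name = "google/pegasus-x-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = PegasusXForConditionalGeneration.from_pretrained(model_name)

args = Seq2SeqTrainingArguments(
    output_dir="pegasus-x-base-finetuned",  # placeholder
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = total_train_batch_size 8
    num_train_epochs=5,
    lr_scheduler_type="linear",
    seed=42,
    fp16=True,  # "Native AMP" mixed precision
)

# Trainer wiring, assuming tokenized datasets exist:
# from transformers import Seq2SeqTrainer
# trainer = Seq2SeqTrainer(model=model, args=args, tokenizer=tokenizer,
#                          train_dataset=train_ds, eval_dataset=eval_ds)
# trainer.train()
```

The evaluation every 10 steps visible in the results table would correspond to `eval_strategy="steps", eval_steps=10` in these arguments.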
config.json
ADDED
@@ -0,0 +1,63 @@
+{
+  "_name_or_path": "google/pegasus-x-base",
+  "activation_dropout": 0.1,
+  "activation_function": "relu",
+  "add_bias_logits": false,
+  "add_final_layer_norm": true,
+  "architectures": [
+    "PegasusXForConditionalGeneration"
+  ],
+  "attention_dropout": 0.1,
+  "block_size": 512,
+  "bos_token_id": 0,
+  "classif_dropout": 0.0,
+  "classifier_dropout": 0.0,
+  "d_model": 768,
+  "decoder_attention_heads": 12,
+  "decoder_ffn_dim": 3072,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 12,
+  "decoder_start_token_id": 0,
+  "dropout": 0.1,
+  "early_stopping": true,
+  "encoder_attention_heads": 12,
+  "encoder_ffn_dim": 3072,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 12,
+  "eos_token_id": 1,
+  "extra_pos_embeddings": 1,
+  "force_bos_token_to_be_generated": false,
+  "forced_eos_token_id": 1,
+  "gradient_checkpointing": false,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "length_penalty": 2.0,
+  "max_length": 512,
+  "max_position_embeddings": 16384,
+  "min_length": 100,
+  "model_type": "pegasus_x",
+  "no_repeat_ngram_size": 3,
+  "normalize_before": true,
+  "normalize_embedding": false,
+  "num_beams": 8,
+  "num_global_tokens": 128,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "scale_embedding": true,
+  "stagger_local_blocks": true,
+  "static_position_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.2",
+  "use_cache": true,
+  "vocab_size": 96103
+}
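Most of the non-architecture keys in this config (`num_beams`, `length_penalty`, `min_length`, `max_length`, `no_repeat_ngram_size`) are generation defaults, so `generate()` picks them up without extra arguments. A minimal usage sketch follows; the checkpoint path and input file are placeholders.

```python
# Sketch only: load this checkpoint and summarize one dataset input.
from transformers import AutoTokenizer, PegasusXForConditionalGeneration

ckpt = "path/to/this/checkpoint"  # placeholder for a local clone of this repo
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = PegasusXForConditionalGeneration.from_pretrained(ckpt)

text = open("dataset/inputs/A00-1031.txt").read()  # placeholder path
inputs = tokenizer(text, return_tensors="pt", truncation=True,
                   max_length=16384)  # matches max_position_embeddings

# num_beams=8, length_penalty=2.0, min_length=100, max_length=512 and
# no_repeat_ngram_size=3 all come from config.json.
ids = model.generate(**inputs)
print(tokenizer.decode(ids[0], skip_special_tokens=True))
```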
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-1031.txt
ADDED
@@ -0,0 +1 @@
a large number of current language processing systems use a part-of-speech tagger for pre-processing. the tagger assigns a (unique or ambiguous) part-ofspeech tag to each token in the input and passes its output to the next processing level, usually a parser. furthermore, there is a large interest in part-ofspeech tagging for corpus annotation projects, who create valuable linguistic resources by a combination of automatic processing and human correction. for both applications, a tagger with the highest possible accuracy is required. the debate about which paradigm solves the part-of-speech tagging problem best is not finished. recent comparisons of approaches that can be trained on corpora (van halteren et al., 1998; volk and schneider, 1998) have shown that in most cases statistical aproaches (cutting et al., 1992; schmid, 1995; ratnaparkhi, 1996) yield better results than finite-state, rule-based, or memory-based taggers (brill, 1993; daelemans et al., 1996). they are only surpassed by combinations of different systems, forming a "voting tagger". among the statistical approaches, the maximum entropy framework has a very strong position. nevertheless, a recent independent comparison of 7 taggers (zavrel and daelemans, 1999) has shown that another approach even works better: markov models combined with a good smoothing technique and with handling of unknown words. this tagger, tnt, not only yielded the highest accuracy, it also was the fastest both in training and tagging. the tagger comparison was organized as a "blackbox test": set the same task to every tagger and compare the outcomes. this paper describes the models and techniques used by tnt together with the implementation. the reader will be surprised how simple the underlying model is. the result of the tagger comparison seems to support the maxime "the simplest is the best". however, in this paper we clarify a number of details that are omitted in major previous publications concerning tagging with markov models. as two examples, (rabiner, 1989) and (charniak et al., 1993) give good overviews of the techniques and equations used for markov models and part-ofspeech tagging, but they are not very explicit in the details that are needed for their application. we argue that it is not only the choice of the general model that determines the result of the tagger but also the various "small" decisions on alternatives. the aim of this paper is to give a detailed account of the techniques used in tnt. additionally, we present results of the tagger on the negra corpus (brants et al., 1999) and the penn treebank (marcus et al., 1993). the penn treebank results reported here for the markov model approach are at least equivalent to those reported for the maximum entropy approach in (ratnaparkhi, 1996). for a comparison to other taggers, the reader is referred to (zavrel and daelemans, 1999).we have shown that a tagger based on markov models yields state-of-the-art results, despite contrary claims found in the literature. for a comparison to other taggers, the reader is referred to (zavrel and daelemans, 1999). a large number of current language processing systems use a part-of-speech tagger for pre-processing. tnt is freely available to universities and related organizations for research purposes (see http://www.coli.uni-sb.derthorstenant). the penn treebank results reported here for the markov model approach are at least equivalent to those reported for the maximum entropy approach in (ratnaparkhi, 1996). 
the tagger assigns a (unique or ambiguous) part-ofspeech tag to each token in the input and passes its output to the next processing level, usually a parser. additionally, we present results of the tagger on the negra corpus (brants et al., 1999) and the penn treebank (marcus et al., 1993). it is a very interesting future research topic to determine the advantages of either of these approaches, to find the reason for their high accuracies, and to find a good combination of both. furthermore, there is a large interest in part-ofspeech tagging for corpus annotation projects, who create valuable linguistic resources by a combination of automatic processing and human correction. for example, the markov model tagger used in the comparison of (van halteren et al., 1998) yielded worse results than all other taggers.
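The "good smoothing technique" this abstract credits for TnT's accuracy is, in Brants' published description, linear interpolation of unigram, bigram, and trigram relative frequencies, with the weights set by deleted interpolation. A sketch under that assumption; the lambda values shown are illustrative placeholders, not the trained weights.

```python
# Sketch of TnT-style trigram smoothing: interpolate unigram, bigram and
# trigram relative-frequency estimates. The lambdas below are placeholders;
# TnT derives them from the training corpus by deleted interpolation.
def smoothed_trigram_prob(t1, t2, t3, unigram, bigram, trigram,
                          lambdas=(0.2, 0.3, 0.5)):
    """P(t3 | t1, t2) ~= l1*P(t3) + l2*P(t3 | t2) + l3*P(t3 | t1, t2).

    unigram[t], bigram[(t2, t3)], trigram[(t1, t2, t3)] hold relative
    frequencies estimated from a tagged corpus.
    """
    l1, l2, l3 = lambdas
    return (l1 * unigram.get(t3, 0.0)
            + l2 * bigram.get((t2, t3), 0.0)
            + l3 * trigram.get((t1, t2, t3), 0.0))
```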
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-1043.txt
ADDED
@@ -0,0 +1 @@
current automatic summarizers usually rely on sentence extraction to produce summaries. human professionals also often reuse the input documents to generate summaries; however, rather than simply extracting sentences and stringing them together, as most current summarizers do, humans often "edit" the extracted sentences in some way so that the resulting summary is concise and coherent. we analyzed a set of articles and identified six major operations that can be used for editing the extracted sentences, including removing extraneous phrases from an extracted sentence, combining a reduced sentence with other sentences, syntactic transformation, substituting phrases in an extracted sentence with their paraphrases, substituting phrases with more general or specific descriptions, and reordering the extracted sentences (jing and mckeown, 1999; jing and mckeown, 2000). we call the operation of removing extraneous phrases from an extracted sentence sentence reduction. it is one of the most effective operations that can be used to edit the extracted sentences. reduction can remove material at any granularity: a word, a prepositional phrase, a gerund, a to-infinitive or a clause. we use the term "phrase" here to refer to any of the above components that can be removed in reduction. the following example shows an original sentence and its reduced form written by a human professional: original sentence: when it arrives sometime next year in new tv sets, the v-chip will give parents a new and potentially revolutionary device to block out programs they don't want their children to see. reduced sentence by humans: the v-chip will give parents a device to block out programs they don't want their children to see. we implemented an automatic sentence reduction system. input to the reduction system includes extracted sentences, as well as the original document. output of reduction are reduced forms of the extracted sentences, which can either be used to produce summaries directly, or be merged with other sentences. the reduction system uses multiple sources of knowledge to make reduction decisions, including syntactic knowledge, context, and statistics computed from a training corpus. we evaluated the system against the output of human professionals. the program achieved a success rate of 81.3%, meaning that 81.3% of reduction decisions made by the system agreed with those of humans. sentence reduction improves the conciseness of automatically generated summaries, making it concise and on target. it can also improve the coherence of generated summaries, since extraneous phrases that can potentially introduce incoherece are removed. we collected 500 sentences and their corresponding reduced forms written by humans, and found that humans reduced the length of these 500 sentences by 44.2% on average. this indicates that a good sentence reduction system can improve the conciseness of generated summaries significantly. in the next section, we describe the sentence reduction algorithm in details. in section 3, we introduce the evaluation scheme used to access the performance of the system and present evaluation results. in section 4, we discuss other applications of sentence reduction, the interaction between reduction and other modules in a summarization system, and related work on sentence simplication. 
finally, we the goal of sentence reduction is to "reduce without major loss"; that is, we want to remove as many extraneous phrases as possible from an extracted sentence so that it can be concise, but without detracting from the main idea the sentence conveys. ideally, we want to remove a phrase from an extracted sentence only if it is irrelevant to the main topic. to achieve this, the system relies on multiple sources of knowledge to make reduction decisions. we first introduce the resources in the system and then describe the reduction algorithm. (1) the corpus. one of the key features of the system is that it uses a corpus consisting of original sentences and their corresponding reduced forms written by humans for training and testing purpose. this corpus was created using an automatic program we have developed to automatically analyze human-written abstracts. the program, called the decomposition program, matches phrases in a human-written summary sentence to phrases in the original document (jing and mckeown, 1999). the human-written abstracts were collected from the free daily news service "communicationsrelated headlines", provided by the benton foundation (http://www.benton.org). the articles in the corpus are news reports on telecommunication related issues, but they cover a wide range of topics, such as law, labor, and company mergers. database to date. it provides lexical relations between words, including synonymy, antonymy, meronymy, entailment (e.g., eat —> chew), or causation (e.g., kill --* die). these lexical links are used to identify the focus in the local context. (4) the syntactic parser. we use the english slot grammar(esg) parser developed at ibm (mccord, 1990) to analyze the syntactic structure of an input sentence and produce a sentence parse tree. the esg parser not only annotates the syntactic category of a phrase (e.g., "np" or "vp"), it also annotates the thematic role of a phrase (e.g., "subject" or "object"). there are five steps in the reduction program: step 1: syntactic parsing. we first parse the input sentence using the esg parser and produce the sentence parse tree. the operations in all other steps are performed based on this parse tree. each following step annotates each node in the parse tree with additional information, such as syntactic or context importance, which are used later to determine which phrases (they are represented as subtrees in a parse tree) can be considered extraneous and thus removed. step 2: grammar checking. in this step, we determine which components of a sentence must not be deleted to keep the sentence grammatical. to do this, we traverse the parse tree produced in the first step in top-down order and mark, for each node in the parse tree, which of its children are grammatically obligatory. we use two sources of knowledge for this purpose. one source includes simple, linguistic-based rules that use the thematic role structure produced by the esg parser. for instance, for a sentence, the main verb, the subject, and the object(s) are essential if they exist, but a prepositional phrase is not; for a noun phrase, the head noun is essential, but an adjective modifier of the head noun is not. the other source we rely on is the large-scale lexicon we described earlier. the information in the lexicon is used to mark the obligatory arguments of verb phrases. 
for example, for the verb "convince", the lexicon has the following entry: this entry indicates that the verb "convince" can be followed by a noun phrase and a prepositional phrase starting with the preposition "of' (e.g., he convinced me of his innocence). it can also be followed by a noun phrase and a to-infinitive phrase (e.g., he convinced me to go to the party). this information prevents the system from deleting the "of" prepositional phrase or the to-infinitive that is part of the verb phrase. at the end of this step, each node in the parse tree — including both leaf nodes and intermediate nodes — is annotated with a value indicating whether it is grammatically obligatory. note that whether a node is obligatory is relative to its parent node only. for example, whether a determiner is obligatory is relative to the noun phrase it is in; whether a prepositional phrase is obligatory is relative to the sentence or the phrase it is in. step 3: context information. in this step, the system decides which components in the sentence are most related to the main topic being discussed. to measure the importance of a phrase in the local context, the system relies on lexical links between words. the hypothesis is that the more connected a word is with other words in the local context, the more likely it is to be the focus of the local context. we link the words in the extracted sentence with words in its local context, if they are repetitions, morphologically related, or linked in wordnet through one of the lexical relations. the system then computes an importance score for each word in the extracted sentence, based on the number of links it has with other words and the types of links. the formula for computing the context importance score for a word w is as follows: here, i represents the different types of lexical relations the system considered, including repetition, inflectional relation, derivational relation, and the lexical relations from wordnet. we assigned a weight to each type of lexical relation, represented by li in the formula. relations such as repetition or inflectional relation are considered more important and are assigned higher weights, while relations such as hypernym are considered less important and assigned lower weights. nu (w) in the formula represents the number of a particular type of lexical links the word w has with words in the local context. after an importance score is computed for each word, each phrase in the 'sentence gets a score by adding up the scores of its children nodes in the parse tree. this score indicates how important the phrase is in the local context. step 4: corpus evidence. the program uses a corpus consisting of sentences reduced by human professionals and their corresponding original sentences to compute how likely humans remove a certain phrase. the system first parsed the sentences in the corpus using esg parser. it then marked which subtrees in these parse trees (i.e., phrases in the sentences) were removed by humans. using this corpus of marked parse trees, we can compute how likely a subtree is removed from its parent node. for example, we can compute the probability that the "when" temporal clause is removed when the main verb is "give", represented as prob("when-clause is removed" i "v=give"), or the probability that the to-infinitive modifier of the head noun "device" is removed, represented as prob("to-infinitive modifier is removed" i"n=device"). these probabilities are computed using bayes's rule. 
for example, the probability that the "when" temporal clause is removed when the main verb is "give", prob("when-clause is removed" i "v=give"), is computed as the product of prob( "v=give" i "when-clause is removed") (i.e., the probability that the main verb is "give" when the "when" clause is removed) and prob("when-clause is removed") (i.e., the probability that the "when" clause is removed), divided by prob("v=give") (i.e., the probability that the main verb is "give"). besides computing the probability that a phrase is removed, we also compute two other types of probabilities: the probability that a phrase is reduced (i.e., the phrase is not removed as a whole, but some components in the phrase are removed), and the probability that a phrase is unchanged at all (i.e., neither removed nor reduced). these corpus probabilities help us capture human practice. for example, for sentences like "the agency reported that ..." , "the other source says that ..." , "the new study suggests that ..." , the thatclause following the say-verb (i.e., report, say, and suggest) in each sentence is very rarely changed at all by professionals. the system can capture this human practice, since the probability that that-clause of the verb say or report being unchanged at all will be relatively high, which will help the system to avoid removing components in the that-clause. these corpus probabilities are computed beforehand using a training corpus. they are then stored in a table and loaded at running time. step 5: final decision. the final reduction decisions are based on the results from all the earlier steps. to decide which phrases to remove, the system traverses the sentence parse tree, which now have been annotated with different types of information from earlier steps, in the top-down order and decides which subtrees should be removed, reduced or unchanged. a subtree (i.e., a phrase) is removed only if it is not grammatically obligatory, not the focus of the local context (indicated by a low importance score), and has a reasonable probability of being removed by humans. figure 1 shows sample output of the reduction program. the reduced sentences produced by humans are also provided for comparison.current automatic summarizers usually rely on sentence extraction to produce summaries. the reduced sentences produced by humans are also provided for comparison. this material is based upon work supported by the national science foundation under grant no. figure 1 shows sample output of the reduction program. we call the operation of removing extraneous phrases from an extracted sentence sentence reduction. a subtree (i.e., a phrase) is removed only if it is not grammatically obligatory, not the focus of the local context (indicated by a low importance score), and has a reasonable probability of being removed by humans. it is one of the most effective operations that can be used to edit the extracted sentences. reduction can remove material at any granularity: a word, a prepositional phrase, a gerund, a to-infinitive or a clause. to decide which phrases to remove, the system traverses the sentence parse tree, which now have been annotated with different types of information from earlier steps, in the top-down order and decides which subtrees should be removed, reduced or unchanged. the final reduction decisions are based on the results from all the earlier steps. we use the term "phrase" here to refer to any of the above components that can be removed in reduction.
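The context-importance formula referenced in the text above was lost in extraction; from the surrounding description it combines, for each lexical-relation type i, a weight l_i with the number of links n_i(w) a word has of that type, and a phrase scores the sum of its words. A sketch of that scoring; the weight values are illustrative, not taken from the paper.

```python
# Sketch of the context-importance score: score(w) = sum_i l_i * n_i(w),
# where i ranges over relation types. Weights are illustrative placeholders;
# the paper only says repetition/inflection outrank hypernymy.
RELATION_WEIGHTS = {
    "repetition": 1.0,
    "inflectional": 0.9,
    "derivational": 0.7,
    "wordnet_synonym": 0.5,
    "wordnet_hypernym": 0.3,
}

def word_importance(link_counts):
    """link_counts: dict mapping relation type -> number of links for a word."""
    return sum(RELATION_WEIGHTS.get(rel, 0.0) * n
               for rel, n in link_counts.items())

def phrase_importance(words_link_counts):
    """A phrase's importance is the sum of its words' scores."""
    return sum(word_importance(lc) for lc in words_link_counts)
```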
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2004.txt
ADDED
@@ -0,0 +1 @@
even moderately long documents typically address several topics or different aspects of the same topic. the aim of linear text segmentation is to discover the topic boundaries. the uses of this procedure include information retrieval (hearst and plaunt, 1993; hearst, 1994; yaari, 1997; reynar, 1999), summarization (reynar, 1998), text understanding, anaphora resolution (kozima, 1993), language modelling (morris and hirst, 1991; beeferman et al., 1997b) and improving document navigation for the visually disabled (choi, 2000). this paper focuses on domain independent methods for segmenting written text. we present a new algorithm that builds on previous work by reynar (reynar, 1998; reynar, 1994). the primary distinction of our method is the use of a ranking scheme and the cosine similarity measure (van rijsbergen, 1979) in formulating the similarity matrix. we propose that the similarity values of short text segments is statistically insignificant. thus, one can only rely on their order, or rank, for clustering.a segmentation algorithm has two key elements, a, clustering strategy and a similarity measure. even moderately long documents typically address several topics or different aspects of the same topic. we would also like to develop a linear time and multi-source version of the algorithm. thus, one can only rely on their order, or rank, for clustering. the significance of our results has been confirmed by both t-test and ks-test. given the quality of an algorithm is task dependent, the following experiments focus on the relative performance. c99, k98 and r98 are all polynomial time algorithms. it would be interesting to compare c99 with the multi-source method described in (beeferman et al., 1999) using the tdt corpus. existing work falls into one of two categories, lexical cohesion methods and multi-source methods (yaari, 1997). our results show divisive clustering (r98) is more precise than sliding window (h94) and lexical chains (k98) for locating topic boundaries. the definition of a topic segment ranges from complete stories (allan et al., 1998) to summaries (ponte and croft, 1997). if one disregards segmentation accuracy, h94 has the best algorithmic performance (linear). the focus is on the segmentation of transcribed spoken text and broadcast news stories where the presentation format and regular cues can be exploited to improve accuracy.
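The two ingredients this abstract highlights, a cosine similarity matrix over sentence vectors and a rank transformation of its values, can be sketched as follows. This is an approximation: the published C99 algorithm's window size and masking details differ, and the ranking shown (proportion of neighbours with lower similarity) is one plausible reading.

```python
# Sketch of cosine similarity plus local rank transformation, the idea being
# that raw similarities of short segments are noisy but their ranks are not.
import numpy as np

def cosine_similarity_matrix(vectors):
    """vectors: (n_sentences, vocab) term-frequency matrix (float)."""
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    unit = vectors / np.maximum(norms, 1e-12)
    return unit @ unit.T

def rank_transform(sim, r=11):
    """Replace each cell by the share of its r x r neighbours it exceeds."""
    n = sim.shape[0]
    ranked = np.zeros_like(sim)
    for i in range(n):
        for j in range(n):
            lo_i, hi_i = max(0, i - r // 2), min(n, i + r // 2 + 1)
            lo_j, hi_j = max(0, j - r // 2), min(n, j + r // 2 + 1)
            window = sim[lo_i:hi_i, lo_j:hi_j]
            ranked[i, j] = (window < sim[i, j]).sum() / max(window.size - 1, 1)
    return ranked
```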
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2009.txt
ADDED
@@ -0,0 +1 @@
word sense disambiguation is often cast as a problem in supervised learning, where a disambiguator is induced from a corpus of manually sense—tagged text using methods from statistics or machine learning. these approaches typically represent the context in which each sense—tagged instance of a word occurs with a set of linguistically motivated features. a learning algorithm induces a representative model from these features which is employed as a classifier to perform disambiguation. this paper presents a corpus—based approach that results in high accuracy by combining a number of very simple classifiers into an ensemble that performs disambiguation via a majority vote. this is motivated by the observation that enhancing the feature set or learning algorithm used in a corpus—based approach does not usually improve disambiguation accuracy beyond what can be attained with shallow lexical features and a simple supervised learning algorithm. for example, a naive bayesian classifier (duda and hart, 1973) is based on a blanket assumption about the interactions among features in a sensetagged corpus and does not learn a representative model. despite making such an assumption, this proves to be among the most accurate techniques in comparative studies of corpus—based word sense disambiguation methodologies (e.g., (leacock et al., 1993), (mooney, 1996), (ng and lee, 1996), (pedersen and bruce, 1997)). these studies represent the context in which an ambiguous word occurs with a wide variety of features. however, when the contribution of each type of feature to overall accuracy is analyzed (eg. (ng and lee, 1996)), shallow lexical features such as co—occurrences and collocations prove to be stronger contributors to accuracy than do deeper, linguistically motivated features such as part—of—speech and verb—object relationships. it has also been shown that the combined accuracy of an ensemble of multiple classifiers is often significantly greater than that of any of the individual classifiers that make up the ensemble (e.g., (dietterich, 1997)). in natural language processing, ensemble techniques have been successfully applied to part— of—speech tagging (e.g., (brill and wu, 1998)) and parsing (e.g., (henderson and brill, 1999)). when combined with a history of disambiguation success using shallow lexical features and naive bayesian classifiers, these findings suggest that word sense disambiguation might best be improved by combining the output of a number of such classifiers into an ensemble. this paper begins with an introduction to the naive bayesian classifier. the features used to represent the context in which ambiguous words occur are presented, followed by the method for selecting the classifiers to include in the ensemble. then, the line and interesi data is described. experimental results disambiguating these words with an ensemble of naive bayesian classifiers are shown to rival previously published results. this paper closes with a discussion of the choices made in formulating this methodology and plans for future work.word sense disambiguation is often cast as a problem in supervised learning, where a disambiguator is induced from a corpus of manually sense—tagged text using methods from statistics or machine learning. this paper closes with a discussion of the choices made in formulating this methodology and plans for future work. a preliminary version of this paper appears in (pedersen, 2000). 
experimental results disambiguating these words with an ensemble of naive bayesian classifiers are shown to rival previously published results. these approaches typically represent the context in which each sense—tagged instance of a word occurs with a set of linguistically motivated features. a naive bayesian classifier assumes that all the feature variables representing a problem are conditionally independent given the value of a classification variable. each of the nine member classifiers votes for the most probable sense given the particular context represented by that classifier; the ensemble disambiguates by assigning the sense that receives a majority of the votes. this work extends ideas that began in collaboration with rebecca bruce and janyce wiebe. this paper shows that word sense disambiguation accuracy can be improved by combining a number of simple classifiers into an ensemble. this approach was evaluated using the widely studied nouns line and interest, which are disambiguated with accuracy of 88% and 89%, which rivals the best previously published results.
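A sketch of the ensemble scheme described above: one naive Bayes classifier per context representation, combined by majority vote. The feature construction (e.g. co-occurrence windows of varying width) is assumed here, not reproduced from the paper.

```python
# Sketch only: each member is trained on its own "view" of the context
# (a different feature matrix), and disambiguation is a majority vote.
from collections import Counter
from sklearn.naive_bayes import MultinomialNB

def train_ensemble(feature_matrices, labels):
    """feature_matrices[i]: design matrix for view i (same rows, same labels)."""
    return [MultinomialNB().fit(X, labels) for X in feature_matrices]

def ensemble_predict(classifiers, views):
    """views[i]: single-instance feature matrix for member i; majority wins."""
    votes = [clf.predict(X)[0] for clf, X in zip(classifiers, views)]
    return Counter(votes).most_common(1)[0][0]
```

With an odd number of members, nine in the paper, most votes produce a clear majority.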
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2018.txt
ADDED
@@ -0,0 +1 @@
we present a new parser for parsing down to penn tree-bank style parse trees [16] that achieves 90.1% average precision/recall for sentences of length < 40, and 89.5% for sentences of length < 100, when trained and tested on the previously established [5,9,10,15,17] "standard" sections of the wall street journal tree-bank. this represents a 13% decrease in error rate over the best single-parser results on this corpus [9]. following [5,10], our parser is based upon a probabilistic generative model. that is, for all sentences s and all parses 7r, the parser assigns a probability p(s , 7r) = p(r), the equality holding when we restrict consideration to 7r whose yield * this research was supported in part by nsf grant lis sbr 9720368. the author would like to thank mark johnson and all the rest of the brown laboratory for linguistic information processing. is s. then for any s the parser returns the parse ir that maximizes this probability. that is, the parser implements the function arg maxrp(7r s) = arg maxirp(7r, s) = arg maxrp(w). what fundamentally distinguishes probabilistic generative parsers is how they compute p(r), and it is to that topic we turn next.what fundamentally distinguishes probabilistic generative parsers is how they compute p(r), and it is to that topic we turn next. it is to this project that our future parsing work will be devoted. we have presented a lexicalized markov grammar parsing model that achieves (using the now standard training/testing/development sections of the penn treebank) an average precision/recall of 91.1% on sentences of length < 40 and 89.5% on sentences of length < 100. indeed, we initiated this line of work in an attempt to create a parser that would be flexible enough to allow modifications for parsing down to more semantic levels of detail. this corresponds to an error reduction of 13% over the best previously published single parser results on this test set, those of collins [9]. we present a new parser for parsing down to penn tree-bank style parse trees [16] that achieves 90.1% average precision/recall for sentences of length < 40, and 89.5% for sentences of length < 100, when trained and tested on the previously established [5,9,10,15,17] "standard" sections of the wall street journal tree-bank. in the previous sections we have concentrated on the relation of the parser to a maximumentropy approach, the aspect of the parser that is most novel.
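The probability expressions in the extracted text above are OCR-garbled ("p(s , 7r) = p(r)", "arg maxrp(7r s) = arg maxirp(7r, s) = arg maxrp(w)"); from context they presumably read:

```latex
p(s, \pi) = p(\pi) \quad \text{when the yield of } \pi \text{ is } s,
\qquad
\hat{\pi} = \arg\max_{\pi} p(\pi \mid s) = \arg\max_{\pi} p(\pi, s) = \arg\max_{\pi} p(\pi)
```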
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2019.txt
ADDED
@@ -0,0 +1 @@
a good indicator of whether a person knows the meaning of a word is the ability to use it appropriately in a sentence (miller and gildea, 1987). much information about usage can be obtained from quite a limited context: choueka and lusignan (1985) found that people can typically recognize the intended sense of a polysemous word by looking at a narrow window of one or two words around it. statistically-based computer programs have been able to do the same with a high level of accuracy (kilgarriff and palmer, 2000). the goal of our work is to automatically identify inappropriate usage of specific vocabulary words in essays by looking at the local contextual cues around a target word. we have developed a statistical system, alek (assessing lexical knowledge), that uses statistical analysis for this purpose. a major objective of this research is to avoid the laborious and costly process of collecting errors (or negative evidence) for each word that we wish to evaluate. instead, we train alek on a general corpus of english and on edited text containing example uses of the target word. the system identifies inappropriate usage based on differences between the word's local context cues in an essay and the models of context it has derived from the corpora of well-formed sentences. a requirement for alek has been that all steps in the process be automated, beyond choosing the words to be tested and assessing the results. once a target word is chosen, preprocessing, building a model of the word's appropriate usage, and identifying usage errors in essays is performed without manual intervention. alek has been developed using the test of english as a foreign language (toefl) administered by the educational testing service. toefl is taken by foreign students who are applying to us undergraduate and graduate-level programs.a good indicator of whether a person knows the meaning of a word is the ability to use it appropriately in a sentence (miller and gildea, 1987). toefl is taken by foreign students who are applying to us undergraduate and graduate-level programs. the problem of error detection does not entail finding similarities to appropriate usage, rather it requires identifying one element among the contextual cues that simply does not fit. approaches to detecting errors by non-native writers typically produce grammars that look for specific expected error types (schneider and mccoy, 1998; park, palmer and washburn, 1997). the unsupervised techniques that we have presented for inferring negative evidence are effective in recognizing grammatical errors in written text. however, its techniques could be incorporated into a grammar checker for native speakers. alek has been developed using the test of english as a foreign language (toefl) administered by the educational testing service. much information about usage can be obtained from quite a limited context: choueka and lusignan (1985) found that people can typically recognize the intended sense of a polysemous word by looking at a narrow window of one or two words around it. under this approach, essays written by esl students are collected and examined for errors.
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2024.txt
ADDED
@@ -0,0 +1 @@
there is a big gap between the summaries produced by current automatic summarizers and the abstracts written by human professionals. certainly one factor contributing to this gap is that automatic systems can not always correctly identify the important topics of an article. another factor, however, which has received little attention, is that automatic summarizers have poor text generation techniques. most automatic summarizers rely on extracting key sentences or paragraphs from an article to produce a summary. since the extracted sentences are disconnected in the original article, when they are strung together, the resulting summary can be inconcise, incoherent, and sometimes even misleading. we present a cut and paste based text summarization technique, aimed at reducing the gap between automatically generated summaries and human-written abstracts. rather than focusing on how to identify key sentences, as do other researchers, we study how to generate the text of a summary once key sentences have been extracted. the main idea of cut and paste summarization is to reuse the text in an article to generate the summary. however, instead of simply extracting sentences as current summarizers do, the cut and paste system will "smooth" the extracted sentences by editing them. such edits mainly involve cutting phrases and pasting them together in novel ways. the key features of this work are:there is a big gap between the summaries produced by current automatic summarizers and the abstracts written by human professionals. the key features of this work are: finally, we conclude and discuss future work. this paper presents a novel architecture for text summarization using cut and paste techniques observed in human-written abstracts. we thank ibm for licensing us the esg parser and the mitre corporation for licensing us the coreference resolution system. we will also extend the system to query-based summarization and investigate whether the system can be modified for multiple document summarization. however, the combination operations and combination rules that we derived from corpus analysis are significantly different from those used in the above system, which mostly came from operations in traditional natural language generation. any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the national science foundation. we identified six operations that can be used alone or together to transform extracted sentences into sentences in human-written abstracts. ing operations. we defined six operations that can be used alone, sequentially, or simultaneously to transform selected sentences from an article into the corresponding summary sentences in its human-written abstract:
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2026.txt
ADDED
@@ -0,0 +1 @@
this paper presents three trainable systems for surface natural language generation (nlg). surface nlg, for our purposes, consists of generating a grammatical natural language phrase that expresses the meaning of an input semantic representation. the systems take a "corpus-based" or "machinelearning" approach to surface nlg, and learn to generate phrases from semantic input by statistically analyzing examples of phrases and their corresponding semantic representations. the determination of the content in the semantic representation, or "deep" generation, is not discussed here. instead, the systems assume that the input semantic representation is fixed and only deal with how to express it in natural language. this paper discusses previous approaches to surface nlg, and introduces three trainable systems for surface nlg, called nlg1, nlg2, and nlg3. quantitative evaluation of experiments in the air travel domain will also be discussed.this paper presents three trainable systems for surface natural language generation (nlg). this paper presents the first systems (known to the author) that use a statistical learning approach to produce natural language text directly from a semantic representation. we conjecture that nlg2 and nlg3 should work in other domains which have a complexity similar to air travel, as well as available annotated data. quantitative evaluation of experiments in the air travel domain will also be discussed. the nlg2 and nlg3 systems automatically attempt to generalize from the knowledge inherent in the training corpus of templates, so that they can generate templates for novel attribute sets. in contrast, (langkilde and knight, 1998) uses corpus-derived statistical knowledge to rank plausible hypotheses from a grammarbased surface generation component. templates are the easiest way to implement surface nlg. this limitation can be overcome by using features on values, so that nlg2 and nlg3 might discover — to use a hypothetical example — that "flights leaving $city-fr" is preferred over "flights from $city-fr" when $city-fr is a particular value, such as "miami". our current approach has the limitation that it ignores the values of attributes, even though they might strongly influence the word order and word choice.
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2030.txt
ADDED
@@ -0,0 +1 @@
since 1995, a few statistical parsing algorithms (magerman, 1995; collins, 1996 and 1997; charniak, 1997; rathnaparki, 1997) demonstrated a breakthrough in parsing accuracy, as measured against the university of pennsylvania treebank as a gold standard. yet, relatively few have embedded one of these algorithms in a task. chiba, (1999) was able to use such a parsing algorithm to reduce perplexity with the long term goal of improved speech recognition. in this paper, we report adapting a lexicalized, probabilistic context-free parser with head rules (lpcfg-hr) to information extraction. the technique was benchmarked in the seventh message understanding conference (muc-7) in 1998. several technical challenges confronted us and were solved: treebank on wall street journal adequately train the algorithm for new york times newswire, which includes dozens of newspapers? manually creating sourcespecific training data for syntax was not required. instead, our parsing algorithm, trained on the upenn treebank, was run on the new york times source to create unsupervised syntactic training which was constrained to be consistent with semantic annotation.this simple semantic annotation was the only source of task knowledge used to configure the model. instead, our parsing algorithm, trained on the upenn treebank, was run on the new york times source to create unsupervised syntactic training which was constrained to be consistent with semantic annotation. we have demonstrated, at least for one problem, that a lexicalized, probabilistic context-free parser with head rules (lpcfghr) can be used effectively for information extraction. our system for muc-7 consisted of the sentential model described in this paper, coupled with a simple probability model for cross-sentence merging. while performance did not quite match the best previously reported results for any of these three tasks, we were pleased to observe that the scores were at or near state-of-the-art levels for all cases. since 1995, a few statistical parsing algorithms (magerman, 1995; collins, 1996 and 1997; charniak, 1997; rathnaparki, 1997) demonstrated a breakthrough in parsing accuracy, as measured against the university of pennsylvania treebank as a gold standard. for the following example, the template relation in figure 2 was to be generated: "donald m. goldstein, a historian at the university of pittsburgh who helped write..." the semantics — that is, the entities and relations — can then be directly extracted from these sentential trees.
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2031.txt
ADDED
@@ -0,0 +1 @@
parsing sentences using statistical information gathered from a treebank was first examined a decade ago in (chitrad and grishman, 1990) and is by now a fairly well-studied problem ((charniak, 1997), (collins, 1997), (ratnaparkhi, 1997)). but to date, the end product of the parsing process has for the most part been a bracketing with simple constituent labels like np, vp, or sbar. the penn treebank contains a great deal of additional syntactic and semantic information from which to gather statistics; reproducing more of this information automatically is a goal which has so far been mostly ignored. this paper details a process by which some of this information—the function tags— may be recovered automatically. in the penn treebank, there are 20 tags (figure 1) that can be appended to constituent labels in order to indicate additional information about the syntactic or semantic role of the constituent. we have divided them into four categories (given in figure 2) based on those in the bracketing guidelines (bies et al., 1995). a constituent can be tagged with multiple tags, but never with two tags from the same category.1 in actuality, the case where a constituent has tags from all four categories never happens, but constituents with three tags do occur (rarely). at a high level, we can simply say that having the function tag information for a given text is useful just because any further information would help. but specifically, there are distinct advantages for each of the various categories. grammatical tags are useful for any application trying to follow the thread of the text—they find the 'who does what' of each clause, which can be useful to gain information about the situation or to learn more about the behaviour of the words in the sentence. the form/function tags help to find those constituents behaving in ways not conforming to their labelled type, as well as further clarifying the behaviour of adverbial phrases. information retrieval applications specialising in describing events, as with a number of the muc applications, could greatly benefit from some of these in determining the where-when-why of things. noting a topicalised constituent could also prove useful to these applications, and it might also help in discourse analysis, or pronoun resolution. finally, the 'miscellaneous' tags are convenient at various times; particularly the clr 'closely related' tag, which among other things marks phrasal verbs and prepositional ditransitives. to our knowledge, there has been no attempt so far to recover the function tags in parsing treebank text. in fact, we know of only one project that used them at all: (collins, 1997) defines certain constituents as complements based on a combination of label and function tag information. this boolean condition is then used to train an improved parser.this work presents a method for assigning function tags to text that has been parsed to the simple label level. this boolean condition is then used to train an improved parser. in fact, we know of only one project that used them at all: (collins, 1997) defines certain constituents as complements based on a combination of label and function tag information. but to date, the end product of the parsing process has for the most part been a bracketing with simple constituent labels like np, vp, or sbar. 
• there is no reason to think that this work could not be integrated directly into the parsing process, particularly if one's parser is already geared partially or entirely towards feature-based statistics; the function tag information could prove quite useful within the parse itself, to rank several parses to find the most plausible. it is as yet unclear just to what degree these tagging errors in the corpus are affecting our results. we have found it useful to define our statistical model in terms of features. there are, it seems, two reasonable baselines for this and future work. this data is very important in distinguishing, for example, 'by john' (where john might be a logical subject) from 'by next year' (a temporal modifier) and 'by selling it' (an adverbial indicating manner).
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A00-2034.txt
ADDED
@@ -0,0 +1 @@
diathesis alternations are alternate ways in which the arguments of a verb are expressed syntactically. the syntactic changes are sometimes accompanied by slight changes in the meaning of the verb. an example of the causative alternation is given in (1) below. in this alternation, the object of the transitive variant can also appear as the subject of the intransitive variant. in the conative alternation, the transitive form alternates with a prepositional phrase construction involving either at or on. an example of the conative alternation is given in (2). we refer to alternations where a particular semantic role appears in different grammatical roles in alternate realisations as "role switching alternations" (rsas). it is these alternations that our method applies to. recently, there has been interest in corpus-based methods to identify alternations (mccarthy and korhonen, 1998; lapata, 1999), and associated verb classifications (stevenson and merlo, 1999). these have either relied on a priori knowledge specified for the alternations in advance, or are not suitable for a wide range of alternations. the fully automatic method outlined here is applied to the causative and conative alternations, but is applicable to other rsas.the fully automatic method outlined here is applied to the causative and conative alternations, but is applicable to other rsas. diathesis alternations are alternate ways in which the arguments of a verb are expressed syntactically. however, a considerably larger corpus would be required to overcome the sparse data problem for other rsa alternations. we have discovered a significant relationship between the similarity of selectional preferences at the target slots, and participation in the causative and conative alternations. diathesis alternations have been proposed for a number of nlp tasks. we propose a method to acquire knowledge of alternation participation directly from corpora, with frequency information available as a by-product. notably, only one negative decision was made because of the disparate frame frequencies, which reduces the cost of combining the argument head data. the syntactic changes are sometimes accompanied by slight changes in the meaning of the verb. these have either relied on a priori knowledge specified for the alternations in advance, or are not suitable for a wide range of alternations. for the conative, a sample of 16 verbs was used and this time accuracy was only 56%. earlier work by resnik (1993) demonstrated a link between selectional preference strength and participation in alternations where the direct object is omitted.
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A88-1019.txt
ADDED
@@ -0,0 +1 @@
it is well-known that part of speech depends on context. the word "table," for example, can be a verb in some contexts (e.g., "he will table the motion") and a noun in others (e.g., "the table is ready"). a program has been written which tags each word in an input sentence with the most likely part of speech. the program produces the following output for the two "table" sentences just mentioned: (pps = subject pronoun; md = modal; vb = verb (no inflection); at = article; nn = noun; bez = present 3rd sg form of "to be"; jj = adjective; notation is borrowed from [francis and kucera, pp. 6-8]) part of speech tagging is an important practical problem with potential applications in many areas including speech synthesis, speech recognition, spelling correction, proof-reading, query answering, machine translation and searching large text data bases (e.g., patents, newspapers). the author is particularly interested in speech synthesis applications, where it is clear that pronunciation sometimes depends on part of speech. consider the following three examples where pronunciation depends on part of speech. first, there are words like "wind" where the noun has a different vowel than the verb. that is, the noun "wind" has a short vowel as in "the wind is strong," whereas the verb "wind" has a long vowel as in "don't forget to wind your watch." secondly, the pronoun "that" is stressed as in "did you see that?" unlike the complementizer "that," as in "it is a shame that he's leaving." thirdly, note the difference between "oily fluid" and "transmission fluid"; as a general rule, an adjective-noun sequence such as "oily fluid" is typically stressed on the right whereas a noun-noun sequence such as "transmission fluid" is typically stressed on the left. these are but three of the many constructions which would sound more natural if the synthesizer had access to accurate part of speech information. perhaps the most important application of tagging programs is as a tool for future research. a number of large projects such as [cobuild] have recently been collecting large corpora (101000 million words) in order to better describe how language is actually used in practice: "for the first time, a dictionary has been compiled by the thorough examination of representative group of english texts, spoken and written, running to many millions of words. this means that in addition to all the tools of the conventional dictionary makers... the dictionary is based on hard, measureable evidence." [cobuild, p. xv] it is likely that there will be more and more research projects collecting larger and larger corpora. a reliable parts program might greatly enhance the value of these corpora to many of these researchers. the program uses a linear time dynamic programming algorithm to find an assignment of parts of speech to words that optimizes the product of (a) lexical probabilities (probability of observing part of speech i given word j), and (b) contextual probabilities (probability of observing part of speech i given k previous parts of speech). probability estimates were obtained by training on the tagged brown corpus [francis and kucera], a corpus of approximately 1,000,000 words with part of speech tags assigned laboriously by hand over many years. program performance is encouraging (95-99% "correct", depending on the definition of "correct"). a small 400 word sample is presented in the appendix, and is judged to be 99.5% correct. it is surprising that a local "bottom-up" approach can perform so well. 
most errors are attributable to defects in the lexicon; remarkably few errors are related to the inadequacies of the extremely over-simplified grammar (a trigram model). apparently, "long distance" dependences are not very important, at least most of the time. one might have thought that ngram models weren't adequate for the task since it is wellknown that they are inadequate for determining grammaticality: "we find that no finite-state markov process that produces symbols with transition from state to state can serve as an english grammar. furthermore, the particular subclass of such processes that produce norder statistical approximations to english do not come closer, with increasing n, to matching the output of an english grammar." [chomsky, p. 113] chomslcy's conclusion was based on the observation that constructions such as: have long distance dependencies that span across any fixed length window n. thus, ngram models are clearly inadequate for many natural language applications. however, for the tagging application, the ngram approximation may be acceptable since long distance dependencies do not seem to be very important. statistical ngram models were quite popular in the 1950s, and have been regaining popularity over the past few years. the ibm speech group is perhaps the strongest advocate of ngram methods, especially in other applications such as speech recognition. robert mercer (private communication, 1982) has experimented with the tagging application, using a restricted corpus (laser patents) and small vocabulary (1000 words). another group of researchers working in lancaster around the same time, leech, garside and atwell, also found ngram models highly effective; they report 96.7% success in automatically tagging the lob corpus, using a bigram model modified with heuristics to cope with more important trigrams. the present work developed independently from the lob project. many people who have not worked in computational linguistics have a strong intuition that lexical ambiguity is usually not much of a problem. it is commonly believed that most words have just one part of speech, and that the few exceptions such as "table" are easily disambiguated by context in most cases. in contrast, most experts in computational linguists have found lexical ambiguity to be a major issue; it is said that practically any content word can be used as a noun, verb or adjective,i and that local context is not always adequate to disambiguate. introductory texts are full of ambiguous sentences such as where no amount of syntactic parsing will help. these examples are generally taken to indicate that the parser must allow for multiple possibilities and that grammar formalisms such as lr(k) are inadequate for natural language since these formalisms cannot cope with ambiguity. this argument was behind a large set of objections to marcus' "lr(k)-like" deterministic parser. although it is clear that an expert in computational linguistics can dream up arbitrarily hard sentences, it may be, as marcus suggested, that most texts are not very hard in practice. recall that marcus hypothesized most decisions can be resolved by the parser within a small window (i.e., three buffer cells), and there are only a few problematic cases where the parser becomes confused. he called these confusing cases "garden paths," by analogy with the famous example: • the horse raced past the barn fell. 
with just a few exceptions such as these "garden paths," marcus assumes, there is almost always a unique "best" interpretation which can be found with very limited resources. the proposed stochastic approach is largely compatible with this; the proposed approach assumes that it is almost always sufficient to assign each word a unique "best" part of speech (and this can be accomplished with a very efficient linear time dynamic programming algorithm). (footnote 1: from an information theory point of view, one can quantify ambiguity in bits. in the case of the brown tagged corpus, the lexical entropy, the conditional entropy of the part of speech given the word, is about 0.25 bits per part of speech. this is considerably smaller than the contextual entropy, the conditional entropy of the part of speech given the next two parts of speech. this entropy is estimated to be about 2 bits per part of speech.) after reading introductory discussions of "flying planes can be dangerous," one might have expected that lexical ambiguity was so pervasive that it would be hopeless to try to assign just one part of speech to each word in just one linear time pass over the input words. find all assignments of parts of speech to "a" and score. the proposed method omitted only 5 of 243 noun phrase brackets in the appendix. it is well-known that part of speech depends on context. there is some tendency to underestimate the number of brackets and run two noun phrases together as in [np the time fairchild]. this is considerably smaller than the contextual entropy, the conditional entropy of the part of speech given the next two parts of speech. this entropy is estimated to be about 2 bits per part of speech. the proposed approach assumes that it is almost always sufficient to assign each word a unique "best" part of speech (and this can be accomplished with a very efficient linear time dynamic programming algorithm). a program has been written which tags each word in an input sentence with the most likely part of speech. in the case of the brown tagged corpus, the lexical entropy, the conditional entropy of the part of speech given the word, is about 0.25 bits per part of speech. the method works remarkably well considering how simple it is. after reading introductory discussions of "flying planes can be dangerous," one might have expected that lexical ambiguity was so pervasive that it would be hopeless to try to assign just one part of speech to each word in just one linear time pass over the input words.
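the search just described can be made concrete in a few lines. the sketch below is a minimal viterbi-style tagger in python that multiplies lexical probabilities p(tag | word) by contextual probabilities in one left-to-right pass; it uses a bigram contextual model rather than the trigram model above, and all names and toy probability tables are illustrative, not the parts program itself.

def viterbi_tag(words, tags, lexical_p, context_p, floor=1e-12):
    # best[t] = (probability of the best tag path ending in tag t, that path)
    best = {t: (lexical_p.get((t, words[0]), floor), [t]) for t in tags}
    for w in words[1:]:
        new_best = {}
        for t in tags:
            lex = lexical_p.get((t, w), floor)  # p(t | w), with a small floor
            # pick the predecessor tag that maximizes the running product
            prev, (score, path) = max(
                best.items(),
                key=lambda kv: kv[1][0] * context_p.get((t, kv[0]), floor))
            new_best[t] = (score * context_p.get((t, prev), floor) * lex,
                           path + [t])
        best = new_best
    return max(best.values())[1]

tags = ["at", "nn", "vb", "md", "pps"]
lexical_p = {("pps", "he"): 1.0, ("md", "will"): 0.8, ("vb", "table"): 0.3,
             ("nn", "table"): 0.7, ("at", "the"): 1.0, ("nn", "motion"): 1.0}
context_p = {("md", "pps"): 0.5, ("vb", "md"): 0.8, ("at", "vb"): 0.4,
             ("nn", "at"): 0.6}
print(viterbi_tag("he will table the motion".split(), tags, lexical_p, context_p))
# -> ['pps', 'md', 'vb', 'at', 'nn']: "table" comes out as a verb in this context

because only the best path per tag is kept at each word, the work per word is bounded by the square of the tagset size, which gives the linear time behaviour in sentence length that the text emphasizes.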
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A92-1006.txt
ADDED
@@ -0,0 +1 @@
+
this paper presents the joyce system as an example of a fully-implemented, application-oriented text generation system. joyce covers the whole range of tasks associated with text generation, from content selection to morphological processing. it was developed as part of the interface of the software design environment ulysses. the following design goals were set for it: while we were able to exploit existing research for many of the design issues, it turned out that we needed to develop our own approach to text planning (rambow 1990). this paper will present the system and attempt to show how these design objectives led to particular design decisions. the structure of the paper is as follows. in section 2, we will present the underlying application and give examples of the output of the system. in section 3, we will discuss the overall structure of joyce. we then discuss the three main components in turn: the text planner in section 4, the sentence planner in section 5 and the realizer in section 6. we will discuss the text planner in some detail since it represents a new approach to the problem. section 7 traces the generation of a short text. in section 8, we address the problem of portability, and wind up by discussing some shortcomings of joyce in the conclusion. this paper presents the joyce system as an example of a fully-implemented, application-oriented text generation system. in section 8, we address the problem of portability, and wind up by discussing some shortcomings of joyce in the conclusion. we are aware of several shortcomings of joyce, which we will address in future versions of the system. ple in text planning, it appears to play an important role as a constraint on possible text structures. it has met the design objectives of speed and quality, and our experience in porting the text generator to new tasks and to new applications indicates that joyce is a flexible system that can adapt to a variety of text generation tasks. it passes it through the incrementor to the formatter, which downgrades it when a classified corrected reading leaves through p34. initial results, including a prototype, are encouraging. furthermore, it helps determine the use of connectives between rhetorically related clauses. despite these shortcomings, joyce has proven to be a successful and useful tool in the ulysses user interface. the joyce text generation system was developed as part of the software design environment ulysses (korelsky and ulysses staff 1988; rosenthal et al. 1988). ulysses includes a graphical environment for the design of secure, distributed software systems.
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A92-1018.txt
ADDED
@@ -0,0 +1 @@
+
many words are ambiguous in their part of speech. for example, "tag" can be a noun or a verb. however, when a word appears in the context of other words, the ambiguity is often reduced: in "a tag is a part-of-speech label," the word "tag" can only be a noun. a part-of-speech tagger is a system that uses context to assign parts of speech to words. automatic text tagging is an important first step in discovering the linguistic structure of large text corpora. part-of-speech information facilitates higher-level analysis, such as recognizing noun phrases and other patterns in text. for a tagger to function as a practical component in a language processing system, we believe that a tagger must be robust, efficient, accurate, tunable, and reusable. robust: text corpora contain ungrammatical constructions, isolated phrases (such as titles), and nonlinguistic data (such as tables); corpora are also likely to contain words that are unknown to the tagger, and it is desirable that a tagger deal gracefully with these situations. efficient: if a tagger is to be used to analyze arbitrarily large corpora, it must be efficient, performing in time linear in the number of words tagged; any training required should also be fast, enabling rapid turnaround with new corpora and new text genres. accurate: a tagger should attempt to assign the correct part-of-speech tag to every word encountered. tunable: a tagger should be able to take advantage of linguistic insights; one should be able to correct systematic errors by supplying appropriate a priori "hints," and it should be possible to give different hints for different corpora. reusable: the effort required to retarget a tagger to new corpora, new tagsets, and new languages should be minimal. many words are ambiguous in their part of speech. for example, "tag" can be a noun or a verb. the algorithm has an accuracy of approximately 80% in assigning grammatical functions. several different approaches have been used for building text taggers. by using the fact that words are typically associated with only a few part-of-speech categories, and carefully ordering the computation, the algorithms have linear complexity (section 3.3). one should be able to correct systematic errors by supplying appropriate a priori "hints." it should be possible to give different hints for different corpora. we have used the tagger in a number of applications. if a noun phrase is labeled, it is also annotated as to whether the governing verb is the closest verb group to the right or to the left. we describe three applications here: phrase recognition; word sense disambiguation; and grammatical function assignment. probabilities corresponding to category sequences that never occurred in the training data are assigned small, non-zero values, ensuring that the model will accept any sequence of tokens, while still providing the most likely tagging. vocabulary independence is achieved by predicting categories for words not in the lexicon, using both context and suffix information.
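the last sentence mentions predicting categories for unknown words from suffix information. as a rough sketch of the suffix half of that idea, here is a minimal guesser that backs off through progressively shorter suffixes; the table and category names are hypothetical, not the tagger's actual data.

def guess_categories(word, suffix_table, default=("noun", "verb", "adjective")):
    # try the longest known suffix first, then progressively shorter ones
    for k in range(min(4, len(word) - 1), 0, -1):
        cats = suffix_table.get(word[-k:])
        if cats:
            return cats
    return default  # open-class fallback for words with no known suffix

suffix_table = {"ing": ("verb", "noun"), "ly": ("adverb",), "tion": ("noun",)}
print(guess_categories("refactoring", suffix_table))  # -> ('verb', 'noun')

in a tagger of the kind described above, these guesses would then be combined with contextual probabilities, so context can still override the suffix evidence.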
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A92-1021.txt
ADDED
@@ -0,0 +1 @@
+
there has been a dramatic increase in the application of probabilistic models to natural language processing over the last few years. the appeal of stochastic techniques over traditional rule-based techniques comes from the ease with which the necessary statistics can be automatically acquired and the fact that very little handcrafted knowledge need be built into the system. in contrast, the rules in rule-based systems are usually difficult to construct and are typically not very robust. one area in which the statistical approach has done particularly well is automatic part of speech tagging, assigning each word in an input sentence its proper part of speech [church 88; cutting et al. 92; derose 88; deroualt and merialdo 86; garside et al. 87; jelinek 85; kupiec 89; meteer et al. 91]. stochastic taggers have obtained a high degree of accuracy without performing any syntactic analysis on the input. these stochastic part of speech taggers make use of a markov model which captures lexical and contextual information. the parameters of the model can be estimated from tagged ([church 88; derose 88; deroualt and merialdo 86; garside et al. 87; meteer et al. 91]) or untagged ([cutting et al. 92; jelinek 85; kupiec 89]) text. once the parameters of the model are estimated, a sentence can then be automatically tagged by assigning it the tag sequence which is assigned the highest probability by the model. performance is often enhanced with the aid of various higher level pre- and postprocessing procedures or by manually tuning the model. a number of rule-based taggers have been built [klein and simmons 63; green and rubin 71; hindle 89]. [klein and simmons 63] and [green and rubin 71] both have error rates substantially higher than state of the art stochastic taggers. [hindle 89] disambiguates words within a deterministic parser. we wanted to determine whether a simple rule-based tagger without any knowledge of syntax can perform as well as a stochastic tagger, or if part of speech tagging really is a domain to which stochastic techniques are better suited. in this paper we describe a rule-based tagger which performs as well as taggers based upon probabilistic models. the rule-based tagger overcomes the limitations common in rule-based approaches to language processing: it is robust, and the rules are automatically acquired. in addition, the tagger has many advantages over stochastic taggers, including: a vast reduction in stored information required, the perspicuity of a small set of meaningful rules as opposed to the large tables of statistics needed for stochastic taggers, ease of finding and implementing improvements to the tagger, and better portability from one tag set or corpus genre to another. we have presented a simple part of speech tagger which performs as well as existing stochastic taggers, but has significant advantages over these taggers. there has been a dramatic increase in the application of probabilistic models to natural language processing over the last few years. the fact that the simple rule-based tagger can perform so well should offer encouragement for researchers to further explore rule-based tagging, searching for a better and more expressive set of patch templates and other variations on this simple but effective theme. the rule-based tagger overcomes the limitations common in rule-based approaches to language processing: it is robust, and the rules are automatically acquired. the tagger is extremely portable. 
the appeal of stochastic techniques over traditional rule-based techniques comes from the ease with which the necessary statistics can be automatically acquired and the fact that very little handcrafted knowledge need be built into the system. perhaps the biggest contribution of this work is in demonstrating that the stochastic method is not the only viable approach for part of speech tagging. in this paper we describe a rule-based tagger which performs as well as taggers based upon probabilistic models. this makes it easy to experiment with extensions to the tagger.
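the tagger described here corrects an initial tagging with an ordered list of learned patch rules. the sketch below applies rules from one common patch template, "change tag a to tag b when the previous tag is z"; it is a simplified illustration, not the paper's actual template set or learning procedure.

def apply_patches(tagged, patches):
    # tagged: list of (word, tag); patches: ordered (from_tag, to_tag, prev_tag)
    words, tags = zip(*tagged)
    tags = list(tags)
    for from_tag, to_tag, prev_tag in patches:
        for i in range(1, len(tags)):
            if tags[i] == from_tag and tags[i - 1] == prev_tag:
                tags[i] = to_tag
    return list(zip(words, tags))

# e.g. retag a verb as a noun when it follows an article
patches = [("vb", "nn", "at")]
print(apply_patches([("the", "at"), ("table", "vb")], patches))
# -> [('the', 'at'), ('table', 'nn')]

learning consists of repeatedly picking, from all instantiations of the templates, the single patch that fixes the most remaining errors on the training corpus, which is why such a small rule list can compete with large statistical tables.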
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A94-1006.txt
ADDED
@@ -0,0 +1 @@
+
the statistical corpus-based renaissance in computational linguistics has produced a number of interesting technologies, including part-of-speech tagging and bilingual word alignment. unfortunately, these technologies are still not as widely deployed in practical applications as they might be. part-of-speech taggers are used in a few applications, such as speech synthesis (sproat et al., 1992) and question answering (kupiec, 1993b). word alignment is newer, found only in a few places (gale and church, 1991a; brown et al., 1993; dagan et al., 1993). it is used at ibm for estimating parameters of their statistical machine translation prototype (brown et al., 1993). we suggest that part of speech tagging and word alignment could have an important role in glossary construction for translation. glossaries are extremely important for translation. how would microsoft, or some other software vendor, want the term "character menu" to be translated in their manuals? technical terms are difficult for translators because they are generally not as familiar with the subject domain as either the author of the source text or the reader of the target text. in many cases, there may be a number of acceptable translations, but it is important for the sake of consistency to standardize on a single one. it would be unacceptable for a manual to use a variety of synonyms for a particular menu or button. customarily, translation houses make extensive job-specific glossaries to ensure consistency and correctness of technical terminology for large jobs. a glossary is a list of terms and their translations (the source and target fields are standard, though many other fields can also be found, e.g., usage notes, part of speech constraints, comments, etc.). we will subdivide the task of constructing a glossary into two subtasks: (1) generating a list of terms, and (2) finding the translation equivalents. the first task will be referred to as the monolingual task and the second as the bilingual task. how should a glossary be constructed? translation schools teach their students to read as much background material as possible in both the source and target languages, an extremely time-consuming process, as the introduction to hann's (1992, p. 8) text on technical translation indicates: contrary to popular opinion, the job of a technical translator has little in common with other linguistic professions, such as literature translation, foreign correspondence or interpreting. apart from an expert knowledge of both languages..., all that is required for the latter professions is a few general dictionaries, whereas a technical translator needs a whole library of specialized dictionaries, encyclopedias and technical literature in both languages; he is more concerned with the exact meanings of terms than with stylistic considerations and his profession requires certain 'detective' skills as well as linguistic and literary ones. beginners in this profession have an especially hard time... this book attempts to meet this requirement. unfortunately, the academic prescriptions are often too expensive for commercial practice. translators need just-in-time glossaries. they cannot afford to do a lot of background reading and "detective" work when they are being paid by the word. they need something more practical. we propose a tool, termight, that automates some of the more tedious and laborious aspects of terminology research. 
the tool relies on part-of-speech tagging and word-alignment technologies to extract candidate terms and translations. it then sorts the extracted candidates and presents them to the user along with reference concordance lines, supporting efficient construction of glossaries. the tool is currently being used by the translators at at&t business translation services (formerly at&t language line services). termight may prove useful in contexts other than human-based translation. primarily, it can support customization of machine translation (mt) lexicons to a new domain. in fact, the arguments for constructing a job-specific glossary for human-based translation may hold equally well for an mt-based process, emphasizing the need for a productivity tool. the monolingual component of termight can be used to construct terminology lists in other applications, such as technical writing, book indexing, hypertext linking, natural language interfaces, text categorization and indexing in digital libraries and information retrieval (salton, 1988; cherry, 1990; harding, 1982; bourigault, 1992; damerau, 1993), while the bilingual component can be useful for information retrieval in multilingual text collections (landauer and littman, 1990). we have shown that terminology research provides a good application for robust natural language technology, in particular for part-of-speech tagging and word-alignment algorithms. the statistical corpus-based renaissance in computational linguistics has produced a number of interesting technologies, including part-of-speech tagging and bilingual word alignment. in particular, we have found the following to be very effective: as the need for efficient knowledge acquisition tools becomes widely recognized, we hope that this experience with termight will be found useful for other text-related systems as well. in fact, the arguments for constructing a job-specific glossary for human-based translation may hold equally well for an mt-based process, emphasizing the need for a productivity tool. unfortunately, these technologies are still not as widely deployed in practical applications as they might be. primarily, it can support customization of machine translation (mt) lexicons to a new domain. part-of-speech taggers are used in a few applications, such as speech synthesis (sproat et al., 1992) and question answering (kupiec, 1993b). termight may prove useful in contexts other than human-based translation. word alignment is newer, found only in a few places (gale and church, 1991a; brown et al., 1993; dagan et al., 1993). the monolingual component of termight can be used to construct terminology lists in other applications, such as technical writing, book indexing, hypertext linking, natural language interfaces, text categorization and indexing in digital libraries and information retrieval (salton, 1988; cherry, 1990; harding, 1982; bourigault, 1992; damerau, 1993), while the bilingual component can be useful for information retrieval in multilingual text collections (landauer and littman, 1990).
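the monolingual component extracts candidate terms from part-of-speech tagged text. a minimal sketch of that style of extraction, using a single noun-sequence pattern (termight's actual patterns are richer, and these names are illustrative):

def candidate_terms(tagged, max_len=4):
    # collect maximal runs of noun tags as multi-word term candidates
    terms, run = [], []
    for word, tag in tagged + [("", "end")]:  # sentinel flushes the last run
        if tag.startswith("nn"):
            run.append(word)
        else:
            if 1 < len(run) <= max_len:
                terms.append(" ".join(run))
            run = []
    return terms

print(candidate_terms([("character", "nn"), ("menu", "nn"), ("is", "vbz")]))
# -> ['character menu']

sorting such candidates (e.g. by frequency) and showing each with its concordance lines is what makes the manual filtering step fast for the translator.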
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A94-1009.txt
ADDED
@@ -0,0 +1 @@
+
part-of-speech tagging is the process of assigning grammatical categories to individual words in a corpus. one widely used approach makes use of a statistical technique called a hidden markov model (hmm). the model is defined by two collections of parameters: the transition probabilities, which express the probability that a tag follows the preceding one (or two for a second order model); and the lexical probabilities, giving the probability that a word has a given tag without regard to words on either side of it. to tag a text, the tags with non-zero probability are hypothesised for each word, and the most probable sequence of tags given the sequence of words is determined from the probabilities. two algorithms are commonly used, known as the forward-backward (fb) and viterbi algorithms. fb assigns a probability to every tag on every word, while viterbi prunes tags which cannot be chosen because their probability is lower than the ones of competing hypotheses, with a corresponding gain in computational efficiency. for an introduction to the algorithms, see cutting et al. (1992), or the lucid description by sharman (1990). there are two principal sources for the parameters of the model. if a tagged corpus prepared by a human annotator is available, the transition and lexical probabilities can be estimated from the frequencies of pairs of tags and of tags associated with words. alternatively, a procedure called baum-welch (bw) re-estimation may be used, in which an untagged corpus is passed through the fb algorithm with some initial model, and the resulting probabilities used to determine new values for the lexical and transition probabilities. by iterating the algorithm with the same corpus, the parameters of the model can be made to converge on values which are locally optimal for the given text. the degree of convergence can be measured using a perplexity measure, based on the sum of -p*log2(p) for hypothesis probabilities p, which gives an estimate of the degree of disorder in the model. the algorithm is again described by cutting et al. and by sharman, and a mathematical justification for it can be found in huang et al. (1990). the first major use of hmms for part of speech tagging was in claws (garside et al., 1987) in the 1970s. with the availability of large corpora and fast computers, there has been a recent resurgence of interest, and a number of variations on and alternatives to the fb, viterbi and bw algorithms have been tried; see the work of, for example, church (church, 1988), brill (brill and marcus, 1992; brill, 1992), derose (derose, 1988) and kupiec (kupiec, 1992). one of the most effective taggers based on a pure hmm is that developed at xerox (cutting et al., 1992). an important aspect of this tagger is that it will give good accuracy with a minimal amount of manually tagged training data. 96% accuracy (correct assignment of tags to word tokens, compared with a human annotator) is quoted, over a 500,000 word corpus. the xerox tagger attempts to avoid the need for a hand-tagged training corpus as far as possible. instead, an approximate model is constructed by hand, which is then improved by bw re-estimation on an untagged training corpus. in the above example, 8 iterations were sufficient. the initial model is set up so that some transitions and some tags in the lexicon are favoured, and hence have a higher initial probability. convergence of the model is improved by keeping the number of parameters in the model down. 
to assist in this, low frequency items in the lexicon are grouped together into equivalence classes, such that all words in a given equivalence class have the same tags and lexical probabilities, and whenever one of the words is looked up, then the data common to all of them is used. re-estimation on any of the words in a class therefore counts towards re-estimation for all of them (the technique was originally developed by kupiec (kupiec, 1989)). the results of the xerox experiment appear very encouraging. preparing tagged corpora by hand is labour-intensive and potentially error-prone, and although a semi-automatic approach can be used (marcus et al., 1993), it is a good thing to reduce the human involvement as much as possible. however, some careful examination of the experiment is needed. in the first place, cutting et al. do not compare the success rate in their work with that achieved from a hand-tagged training text with no re-estimation. secondly, it is unclear how much the initial biasing contributes to the success rate. if significant human intervention is needed to provide the biasing, then the advantages of automatic training become rather weaker, especially if such intervention is needed on each new text domain. the kind of biasing cutting et al. describe reflects linguistic insights combined with an understanding of the predictions a tagger could reasonably be expected to make and the ones it could not. the aim of this paper is to examine the role that training plays in the tagging process, by an experimental evaluation of how the accuracy of the tagger varies with the initial conditions. the results suggest that a completely unconstrained initial model does not produce good quality results, and that one accurately trained from a hand-tagged corpus will generally do better than using an approach based on re-estimation, even when the training comes from a different source. a second experiment shows that there are different patterns of re-estimation, and that these patterns vary more or less regularly with a broad characterisation of the initial conditions. the outcome of the two experiments together points to heuristics for making effective use of training and re-estimation, together with some directions for further research. work similar to that described here has been carried out by merialdo (1994), with broadly similar conclusions. we will discuss this work below. the principal contribution of this work is to separate the effect of the lexical and transition parameters of the model, and to show how the results vary with different degrees of similarity between the training and test data. in the end it may turn out there is simply no way of making the prediction without a source of information extrinsic to both model and corpus. part-of-speech tagging is the process of assigning grammatical categories to individual words in a corpus. the principal contribution of this work is to separate the effect of the lexical and transition parameters of the model, and to show how the results vary with different degrees of similarity between the training and test data. from the observations in the previous section, we propose the following guidelines for how to train an hmm for use in tagging: where no hand-tagged corpus is available, use bw re-estimation with standard convergence tests such as perplexity. one widely used approach makes use of a statistical technique called a hidden markov model (hmm). we will discuss this work below. 
work similar to that described here has been carried out by merialdo (1994), with broadly similar conclusions. the general pattern of the results presented does not vary greatly with the corpus and tagset used. to tag a text, the tags with non-zero probability are hypothesised for each word, and the most probable sequence of tags given the sequence of words is determined from the probabilities.
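the convergence test mentioned above can be made concrete. a generic sketch, not the paper's code: compute the entropy h = -sum(p * log2 p) over the hypothesis probabilities after each baum-welch iteration, report perplexity 2**h, and stop when it settles.

import math

def perplexity(probs):
    # perplexity 2**h, where h is the entropy of the distribution;
    # lower values indicate a more ordered (more confident) model
    h = -sum(p * math.log2(p) for p in probs if p > 0)
    return 2 ** h

# toy distributions as they might look over successive re-estimation iterations
history = [perplexity(d) for d in ([0.25, 0.25, 0.25, 0.25],
                                   [0.6, 0.2, 0.1, 0.1],
                                   [0.8, 0.1, 0.05, 0.05])]
print(history)                               # -> roughly [4.0, 2.97, 2.03]
print(abs(history[-1] - history[-2]) < 1e-3) # convergence test: keep iterating

a uniform distribution over four tags gives the maximum perplexity of 4.0; as re-estimation sharpens the model, the value falls toward 1.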
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A94-1016.txt
ADDED
@@ -0,0 +1 @@
+
machine-readable dictionary (the collins spanish/english), the lexicons used by the kbmt modules, a large set of user-generated bilingual glossaries as well as a gazetteer and a list of proper and organization names. the outputs from these engines (target language words and phrases) are recorded in a chart whose positions correspond to words in the source language input. as a result of the operation of each of the mt engines, new edges are added to the chart, each labeled with the translation of a region of the input string and indexed by this region's beginning and end positions. we will refer to all of these edges as components (as in "components of the translation") for the remainder of this article. the kbmt and ebmt engines also carry a quality score for each output element. the kbmt scores are produced based on whether any questionable heuristics were used in the source analysis or target generation. the ebmt scores are produced using a technique based on human judgements, as described in (nirenburg et al., 1994a), submitted. figure 1 presents a general view of the operation of our multi-engine mt system. the chart manager selects the overall best cover from the collection of candidate partial translations by normalizing each component's quality score (positive, with larger being better), and then selecting the best combination of components with the help of the chart walk algorithm. figure 2 illustrates the result of this process on the example spanish sentence: al momento de su venta a iberia, viasa contaba con ocho aviones, que tenian en promedio 13 anos de vuelo, which can be translated into english as at the moment of its sale to iberia, viasa had eight airplanes, which had on average thirteen years of flight (time). this is a sentence from one of the 1993 arpa mt evaluation texts. for each component, the starting and ending positions in the chart, the corresponding source language words, and alternative translations are shown, as well as the engine and the engine-internal quality scores. inspection of these translations shows numerous problems; for example, at position 12, "aviones" is translated, among other things, as "aircrafts". it must be remembered that these were generated automatically from an on-line dictionary, without any lexical feature marking or other human intervention. it is well known that such automatic methods are at the moment less than perfect, to say the least. in our current system, this is not a major problem, since the results go through a mandatory editing step, as described below. the chart manager normalizes the internal scores to make them directly comparable. in the case of kbmt and ebmt, the pre-existing scores are modified, while lexical transfer results are scored based on the estimated reliability of individual databases, from 0.5 up to 15. currently the kbmt scores are reduced by a constant, except for known erroneous output, which has its score set to zero. the internal ebmt scores range from 0 being perfect to 10,000 being worthless; but the scores are nonlinear. so a region selected by a threshold is converted linearly into scores ranging from zero to a normalized maximum ebmt score. the normalization levels were empirically determined in the initial experiment by having several individuals judge the comparative average quality of the outputs in an actual translation run. 
in every case, the base score produced by the scoring functions is currently multiplied by the length of the candidate in words, on the assumption that longer items are better. we intend to test a variety of functions in order to find the right contribution of the length factor. figure 3 presents the chart walk algorithm used to produce a single, best, non-overlapping, contiguous combination (cover) of the available component translations, assuming correct component quality scores. the code is organized as a recursive divide-and-conquer procedure: to calculate the cover of a region of the input, it is repeatedly split into two parts, at each possible position. each time, the best possible cover for each part is recursively found, and the two scores are combined to give a score for the chart walk containing the two best subwalks. these different splits are then compared with each other and with components from the chart spanning the whole region (if any), and the overall best result is selected. without dynamic programming, this would have a combinatorial time complexity. dynamic programming utilizes a large array to store partial results, so that the best cover of any given subsequence is only computed once; the second time that a recursive call would compute the same result, it is retrieved from the array instead. this reduces the time complexity to o(n^3), and in practice it uses an insignificant part of total processing time. all possible combinations of components are compared: this is not a heuristic method, but an efficient exhaustive one. this is what assures that the chosen cover is optimal. this assumes, in addition to the scores actually being correct, that the scores are compositional, in the sense that the combined score for a set of components really represents their quality as a group. this might not be the case, for example, if gaps or overlaps are allowed in some cases (perhaps where they contain the same words in the same positions). we calculate the combined score for a sequence of components as the weighted average of their individual scores. weighting by length is necessary so that the same components, when combined in a different order, produce the same combined scores. otherwise the algorithm can produce inconsistent results. the chart walk algorithm can also be thought of as filling in the two-dimensional dynamic-programming array (note that this array is a different data structure from the chart). figure 4 shows an intermediate point in the filling of the array. in this figure, each element (i,j) is initially the best score of any single chart component covering the input region from word i to word j. dashes indicate that no one component covers exactly that region. (in rows 1 through 7, the array has not yet been operated on, so it still shows its initial state.) after processing (see rows 9 through 22), each element is the score for the best set of components covering the input from word i to word j (the best cover for this substring). (only a truncated score is shown for each element in the figure, for readability. there is also a list of best components associated with each element.) the array is upper triangular since the starting position of a component i must be less than or equal to its ending position j. for any position, the score is calculated based on a combination of scores in the row to its left and in the column below it, versus the previous contents of the array cell for its position. 
so the array must be filled from the bottom up, and left to right. intuitively, this is because larger regions must be built up from smaller regions within them. for example, to calculate element (8,10), we compute the length-weighted averages of the scores of the best walks over the pair of elements (8,8) and (9,10) versus the pair (8,9) and (10,10), and compare them with the scores of any single chart components going from 8 to 10 (there were none), and take the maximum. referring to figure 2 again, this corresponds to a choice between combining the translations of (8,8) viasa and (9,10) contaba con versus combining the (not shown) translations of (8,9) viasa contaba and (10,10) con. (this (8,9) element was itself previously built up from single word components.) thus, we compare (2*1 + 10*2)/3 = 7.33 with (3.5*2 + 2*1)/3 = 3.0 and select the first, 7.33. the first wins because contaba con has a high score as an idiom from the glossary. figure 5 shows the final array. when the element in the top-right corner is produced (5.78), the algorithm is finished, and the associated set of components is the final chart walk result shown in figure 2. it may seem that the scores should increase towards the top-right corner. this has not generally been the case. while the system produces a number of high-scoring short components, many low-scoring components have to be included to span the entire input. since the score is a weighted average, these low-scoring components pull the combined score down. a clear example can be seen at position (18,18), which has a score of 15. the scores above and to its right each average this 15 with a 5, for total values of 10.0 (all the lengths happen to be 1), and the score continues to decrease with distance from this point as one moves towards the final score, which does include the component for (18,18) in the cover. the chart-oriented integration of mt engines does not easily support deviations from the linear order of the source text elements, as when discontinuous constituents translate contiguous strings or in the case of cross-component substring order differences. we use a language pair-dependent set of postprocessing rules to alleviate this (for example, by switching the order of adjacent single-word adjective and noun components). we use a language pair-dependent set of postprocessing rules to alleviate this (for example, by switching the order of adjacent single-word adjective and noun components). the outputs from these engines (target language words and phrases) are recorded in a chart whose positions correspond to words in the source language input. ultimately, a multi-engine system depends on the quality of each particular engine. the chart-oriented integration of mt engines does not easily support deviations from the linear order of the source text elements, as when discontinuous constituents translate contiguous strings or in the case of cross-component substring order differences. a less ambitious version of this idea would be to run the low-scoring engines only where there are gaps in the normally high-scoring engines. as a result of the operation of each of the mt engines, new edges are added to the chart, each labeled with the translation of a region of the input string and indexed by this region's beginning and end positions. machine-readable dictionary (the collins spanish/english), the lexicons used by the kbmt modules, a large set of user-generated bilingual glossaries as well as a gazetteer and a list of proper and organization names. 
a clear example can be seen at position (18,18), which has a score of 15.
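the chart walk and its length-weighted averaging fit in a few lines of python. a minimal sketch, with memoization standing in for the explicit array; the component table reproduces the worked example above, except that the (9,9) entry is a hypothetical filler so that every position has at least one cover.

from functools import lru_cache

def chart_walk(components, i, j):
    # best non-overlapping, contiguous cover of positions i..j (inclusive);
    # combined scores are length-weighted averages of the two subwalks
    @lru_cache(maxsize=None)
    def best(a, b):
        candidates = []
        if (a, b) in components:        # a single component spans the region
            candidates.append((components[(a, b)], [(a, b)]))
        for k in range(a, b):           # every way to split the region in two
            ls, lc = best(a, k)
            rs, rc = best(k + 1, b)
            weighted = (ls * (k - a + 1) + rs * (b - k)) / (b - a + 1)
            candidates.append((weighted, lc + rc))
        return max(candidates)
    return best(i, j)

components = {(8, 8): 2.0, (9, 10): 10.0, (8, 9): 3.5, (10, 10): 2.0, (9, 9): 1.0}
print(chart_walk(components, 8, 10))    # -> (7.33..., [(8, 8), (9, 10)])

running it reproduces the arithmetic in the text: the (8,8)+(9,10) split scores (2*1 + 10*2)/3 = 7.33 and beats the alternative split at 3.0, so the idiomatic "contaba con" component is kept in the cover.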
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1004.txt
ADDED
@@ -0,0 +1 @@
+
the task of identifying sentence boundaries in text has not received as much attention as it deserves. many freely available natural language processing tools require their input to be divided into sentences, but make no mention of how to accomplish this (e.g. (brill, 1994; collins, 1996)). others perform the division implicitly without discussing performance (e.g. (cutting et al., 1992)). on first glance, it may appear that using a short list of sentence-final punctuation marks, such as ., ?, and !, is sufficient. however, these punctuation marks are not used exclusively to mark sentence breaks. for example, embedded quotations may contain any of the sentence-ending punctuation marks and . is used as a decimal point, in email addresses, to indicate ellipsis and in abbreviations. both ! and ? are somewhat less ambiguous but appear in proper names and may be used multiple times for emphasis to mark a single sentence boundary. (the authors would like to acknowledge the support of arpa grant n66001-94-c-6043, aro grant daah0494-g-0426 and nsf grant sbr89-20230.) lexically-based rules could be written and exception lists used to disambiguate the difficult cases described above. however, the lists will never be exhaustive, and multiple rules may interact badly since punctuation marks exhibit absorption properties. sites which logically should be marked with multiple punctuation marks will often only have one ((nunberg, 1990) as summarized in (white, 1995)). for example, a sentence-ending abbreviation will most likely not be followed by an additional period if the abbreviation already contains one (e.g. note that d.c. is followed by only a single . in the president lives in washington, d.c.). as a result, we believe that manually writing rules is not a good approach. instead, we present a solution based on a maximum entropy model which requires a few hints about what information to use and a corpus annotated with sentence boundaries. the model trains easily and performs comparably to systems that require vastly more information. training on 39441 sentences takes 18 minutes on a sun ultra sparc and disambiguating the boundaries in a single wall street journal article requires only 1.4 seconds. the task of identifying sentence boundaries in text has not received as much attention as it deserves. training on 39441 sentences takes 18 minutes on a sun ultra sparc and disambiguating the boundaries in a single wall street journal article requires only 1.4 seconds. we would also like to thank the anonymous reviewers for their helpful insights. we would like to thank david palmer for giving us the test data he and marti hearst used for their sentence detection experiments. many freely available natural language processing tools require their input to be divided into sentences, but make no mention of how to accomplish this (e.g. we have described an approach to identifying sentence boundaries which performs comparably to other state-of-the-art systems that require vastly more resources. the model trains easily and performs comparably to systems that require vastly more information. to our knowledge, there have been few papers about identifying sentence boundaries. furthermore, we showed that a small training corpus is sufficient for good performance, and we estimate that annotating enough data to achieve good performance would require only several hours of work, in comparison to the many hours required to generate pos tag and lexical probabilities. 
liberman and church suggest in (liberman and church, 1992) that a system could be quickly built to divide newswire text into sentences with a nearly negligible error rate, but do not actually build such a system.
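the "hints" that the maximum entropy model consumes are contextual features of each candidate boundary. a minimal sketch of that feature extraction step; the feature names are illustrative, not the paper's actual templates, and a trained log-linear model would weight and combine them.

def boundary_features(text, i):
    # features describing the candidate sentence boundary at text[i]
    left = text[:i].split()
    right = text[i + 1:].split()
    prev_tok = left[-1] if left else ""
    next_tok = right[0] if right else ""
    return {
        "punct": text[i],
        "prev_contains_period": "." in prev_tok,   # e.g. an abbreviation
        "prev_is_short": len(prev_tok) <= 3,
        "next_is_capitalized": next_tok[:1].isupper(),
    }

s = "He lives in Washington, D.C. The dog barked."
print(boundary_features(s, s.index(". The")))  # the '.' closing "D.C."
# -> {'punct': '.', 'prev_contains_period': True, 'prev_is_short': True,
#     'next_is_capitalized': True}

because all the features are cheap surface tests, both training and tagging stay fast, which is consistent with the timing figures quoted above.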
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1011.txt
ADDED
@@ -0,0 +1 @@
+
we are concerned with surface-syntactic parsing of running text. our main goal is to describe syntactic analyses of sentences using dependency links that show the head-modifier relations between words. in addition, these links have labels that refer to the syntactic function of the modifying word. a simplified example is in figure 1, where the link between i and see denotes that i is the modifier of see and its syntactic function is that of subject. similarly, a modifies bird, and it is a determiner. first, in this paper, we explain some central concepts of the constraint grammar framework from which many of the ideas are derived. then, we give some linguistic background to the notations we are using, with a brief comparison to other current dependency formalisms and systems. the new formalism is described briefly, and it is utilised in a small toy grammar to illustrate how the formalism works. finally, the real parsing system, with a grammar of some 2 500 rules, is evaluated. the parser corresponds to over three man-years of work, which does not include the lexical analyser and the morphological disambiguator, both parts of the existing english constraint grammar parser (karlsson et al., 1995). the parsers can be tested via www. we are concerned with surface-syntactic parsing of running text. the parsers can be tested via www. voutilainen and juha heikkila created the original engcg lexicon. we are using atro voutilainen's (1995) improved part-of-speech disambiguation grammar which runs in the cg-2 parser. however, the comparison to other current systems suggests that our dependency parser is very promising both theoretically and practically. in this paper, we have presented some main features of our new framework for dependency syntax. our work is partly based on the work done with the constraint grammar framework that was originally proposed by fred karlsson (1990). for instance, our main goal is to describe syntactic analyses of sentences using dependency links that show the head-modifier relations between words. the distinction between the complements and the adjuncts is vague in the implementation; neither the complements nor the adjuncts are obligatory. the results are not strictly comparable because the syntactic description is somewhat different. the evaluation was done using small excerpts of data, not used in the development of the system. means that a nominal head (nom-head is a set that contains part-of-speech tags that may represent a nominal head) may not appear anywhere to the left (not *-1). for instance, the verb decide has the tag <p/on> which means that the prepositional phrase on is typically attached to it.
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1014.txt
ADDED
@@ -0,0 +1 @@
+
the work reported in this paper aims at providing syntactically annotated corpora (treebanks) for stochastic grammar induction. in particular, we focus on several methodological issues concerning the annotation of non-configurational languages. in section 2, we examine the appropriateness of existing annotation schemes. on the basis of these considerations, we formulate several additional requirements. a formalism complying with these requirements is described in section 3. section 4 deals with the treatment of selected phenomena. for a description of the annotation tool see section 5. for a description of the annotation tool see section 5. as the annotation scheme described in this paper focusses on annotating argument structure rather than constituent trees, it differs from existing treebanks in several aspects. its extension is subject to further investigations. the work reported in this paper aims at providing syntactically annotated corpora (treebanks) for stochastic grammar induction. the development of linguistically interpreted corpora presents a laborious and time-consuming task. combining raw language data with linguistic information offers a promising basis for the development of new efficient and robust nlp methods. these differences can be illustrated by a comparison with the penn treebank annotation scheme. partial automation included in the current version significantly reduces the manual effort. a uniform representation of local and non-local dependencies makes the structure more transparent. owing to the partial automation, the average annotation efficiency improves by 25% (from around 4 minutes to 3 minutes per sentence). such a word order independent representation has the advantage of all structural information being encoded in a single data structure. real-world texts annotated with different strata of linguistic information can be used for grammar induction. in order to make the annotation process more efficient, extra effort has been put into the development of an annotation tool.
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1029.txt
ADDED
@@ -0,0 +1 @@
+
in the past decade, the speech recognition community has had huge successes in applying hidden markov models, or hmm's, to their problems. more recently, the natural language processing community has effectively employed these models for part-of-speech tagging, as in the seminal (church, 1988) and other, more recent efforts (weischedel et al., 1993). we would now propose that hmm's have successfully been applied to the problem of name-finding. we have built a named-entity (ne) recognition system using a slightly-modified version of an hmm; we call our system "nymble". to our knowledge, nymble out-performs the best published results of any other learning name-finder. furthermore, it performs at or above the 90% accuracy level, often considered "near-human performance". the system arose from the ne task as specified in the last message understanding conference (muc), where organization names, person names, location names, times, dates, percentages and money amounts were to be delimited in text using sgml-markup. we will describe the various models employed, the methods for training these models and the method for "decoding" on test data (the term "decoding" borrowed from the speech recognition community, since one goal of traversing an hmm is to recover the hidden state sequence). to date, we have successfully trained and used the model on both english and spanish, the latter for met, the multi-lingual entity task. we have shown that using a fairly simple probabilistic model, finding names and other numerical entities as specified by the muc tasks can be performed with "near-human performance", often likened to an f of 90 or above. to date, we have successfully trained and used the model on both english and spanish, the latter for met, the multi-lingual entity task. in the past decade, the speech recognition community has had huge successes in applying hidden markov models, or hmm's, to their problems. given the incredibly difficult nature of many nlp tasks, this example of a learned, stochastic approach to name-finding lends credence to the argument that the nlp community ought to push these approaches, to find the limit of phenomena that may be captured by probabilistic, finite-state methods. also, name-finding can be directly employed for link analysis and other information retrieval problems. the basic premise of the approach is to consider the raw text encountered when decoding as though it had passed through a noisy channel, where it had been originally marked with named entities. we would like to incorporate the following into the current model: while our initial results have been quite favorable, there is still much that can be done potentially to improve performance and completely close the gap between learned and rule-based name-finding systems.
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1030.txt
ADDED
@@ -0,0 +1 @@
+
text processing applications, such as machine translation systems, information retrieval systems or natural-language understanding systems, need to identify multi-word expressions that refer to proper names of people, organizations, places, laws and other entities. when encountering mrs. candy hill in input text, for example, a machine translation system should not attempt to look up the translation of candy and hill, but should translate mrs. to the appropriate personal title in the target language and preserve the rest of the name intact. similarly, an information retrieval system should not attempt to expand candy to all of its morphological variants or suggest synonyms (wacholder et al. 1994). the need to identify proper names has two aspects: the recognition of known names and the discovery of new names. since obtaining and maintaining a name database requires significant effort, many applications need to operate in the absence of such a resource. without a database, names need to be discovered in the text and linked to entities they refer to. even where name databases exist, text needs to be scanned for new names that are formed when entities, such as countries or commercial companies, are created, or for unknown names which become important when the entities they refer to become topical. this situation is the norm for dynamic applications such as news providing services or internet information indexing. the next section describes the different types of proper name ambiguities we have observed. section 3 discusses the role of context and world knowledge in their disambiguation; section 4 describes the process of name discovery as implemented in nominator, a module for proper name recognition developed at the ibm t.j. watson research center. sections 5-7 elaborate on nominator's disambiguation heuristics. ambiguity remains one of the main challenges in the processing of natural language text. because of these difficulties, we believe that for the foreseeable future, practical applications to discover new names in text will continue to require the sort of human effort invested in nominator. text processing applications, such as machine translation systems, information retrieval systems or natural-language understanding systems, need to identify multi-word expressions that refer to proper names of people, organizations, places, laws and other entities. sections 5-7 elaborate on nominator's disambiguation heuristics. name identification requires resolution of a subset of the types of structural and semantic ambiguities encountered in the analysis of nouns and noun phrases (nps) in natural language processing. many of these uncategorized names are titles of articles, books and other works of art that we currently do not handle. in the rest of the paper we describe the resources and heuristics we have designed and implemented in nominator and the extent to which they resolve these ambiguities. an evaluation of an earlier version of nominator was performed on 88 wall street journal documents (nist 1993) that had been set aside for testing. all of these ambiguities must be dealt with if proper names are to be identified correctly.
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1039.txt
ADDED
@@ -0,0 +1 @@
+
systems that generate natural language output as part of their interaction with a user have become a major area of research and development. typically, natural language generation is divided into several phases, namely text planning (determining output content and structure), sentence planning (determining abstract target language resources to express content, such as lexical items and syntactic constructions), and realization (producing the final text string) (reiter, 1994). while text and sentence planning may sometimes be combined, a realizer is almost always included as a distinct module. it is in the realizer that knowledge about the target language resides (syntax, morphology, idiosyncratic properties of lexical items). realization is fairly well understood both from a linguistic and from a computational point of view, and therefore most projects that use text generation do not include the realizer in the scope of their research. instead, such projects use an off-the-shelf realizer, among which penman (bateman, 1996) and surge/fuf (elhadad and robin, 1996) are probably the most popular. in this technical note and demo we present a new off-the-shelf realizer, realpro. realpro is derived from previous systems (iordanskaja et al., 1988; iordanskaja et al., 1992; rambow and korelsky, 1992), but represents a new design and a completely new implementation. realpro has the following characteristics, which we believe are unique in this combination: we reserve a more detailed comparison with penman and fuf, as well as with alethgen/gl (coch, 1996) (which is perhaps the system most similar to realpro, since they are based on the same linguistic theory and are both implemented with speed in mind), for a more extensive paper. this technical note presents realpro, concentrating on its structure, its coverage, its interfaces, and its performance. this technical note presents realpro, concentrating on its structure, its coverage, its interfaces, and its performance. systems that generate natural language output as part of their interaction with a user have become a major area of research and development. the development of realpro was partially supported by usaf rome laboratory under contracts f3060293-c-0015, f30602-94-c-0124, and f30602-92-c-0163, and by darpa under contracts f30602-95-2-0005 and f30602-96-c-0220. we are grateful to r. kittredge, t. korelsky, d. mccullough, a. nasr, e. reiter, and m. white as well as to three anonymous reviewers for helpful comments about earlier drafts of this technical note and/or about realpro. the input to realpro is a syntactic dependency structure. this means that realpro gives the developer control over the output, while taking care of the linguistic details. realpro is licensed free of charge to qualified academic institutions, and is licensed for a fee to commercial sites. the system is fully operational, runs on pc as well as on unix work stations, and is currently used in an application we have developed (lavoie et al., 1997) as well as in several on-going projects (weather report generation, machine translation, project report generation). the architecture of realpro is based on meaning-text theory, which posits a sequence of correspondences between different levels of representation.
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/A97-1052.txt
ADDED
@@ -0,0 +1 @@
+
predicate subcategorization is a key component of a lexical entry, because most, if not all, recent syntactic theories 'project' syntactic structure from the lexicon. therefore, a wide-coverage parser utilizing such a lexicalist grammar must have access to an accurate and comprehensive dictionary encoding (at a minimum) the number and category of a predicate's arguments and ideally also information about control with predicative arguments, semantic selection preferences on arguments, and so forth, to allow the recovery of the correct predicate-argument structure. if the parser uses statistical techniques to rank analyses, it is also critical that the dictionary encode the relative frequency of distinct subcategorization classes for each predicate. several substantial machine-readable subcategorization dictionaries exist for english, either built largely automatically from machine-readable versions of conventional learners' dictionaries, or manually by (computational) linguists (e.g. the alvey nl tools (anlt) dictionary, boguraev et al. (1987); the comlex syntax dictionary, grishman et al. (1994)). unfortunately, neither approach can yield a genuinely accurate or comprehensive computational lexicon, because both rest ultimately on the manual efforts of lexicographers / linguists and are, therefore, prone to errors of omission and commission which are hard or impossible to detect automatically (e.g. boguraev & briscoe, 1989; see also section 3.1 below for an example). furthermore, manual encoding is labour intensive and, therefore, it is costly to extend it to neologisms, information not currently encoded (such as relative frequency of different subcategorizations), or other (sub)languages. these problems are compounded by the fact that predicate subcategorization is closely associated to lexical sense and the senses of a word change between corpora, sublanguages and/or subject domains (jensen, 1991). in a recent experiment with a wide-coverage parsing system utilizing a lexicalist grammatical framework, briscoe & carroll (1993) observed that half of parse failures on unseen test data were caused by inaccurate subcategorization information in the anlt dictionary. the close connection between sense and subcategorization and between subject domain and sense makes it likely that a fully accurate 'static' subcategorization dictionary of a language is unattainable in any case. moreover, although schabes (1992) and others have proposed `lexicalized' probabilistic grammars to improve the accuracy of parse ranking, no wide-coverage parser has yet been constructed incorporating probabilities of different subcategorizations for individual predicates, because of the problems of accurately estimating them. these problems suggest that automatic construction or updating of subcategorization dictionaries from textual corpora is a more promising avenue to pursue. preliminary experiments acquiring a few verbal subcategorization classes have been reported by brent (1991, 1993), manning (1993), and ushioda et al. (1993). in these experiments the maximum number of distinct subcategorization classes recognized is sixteen, and only ushioda et al. attempt to derive relative subcategorization frequency for individual predicates. we describe a new system capable of distinguishing 160 verbal subcategorization classes—a superset of those found in the anlt and comlex syntax dictionaries. 
the classes also incorporate information about control of predicative arguments and alternations such as particle movement and extraposition. we report an initial experiment which demonstrates that this system is capable of acquiring the subcategorization classes of verbs and the relative frequencies of these classes with comparable accuracy to the less ambitious extant systems. we achieve this performance by exploiting a more sophisticated robust statistical parser which yields complete though 'shallow' parses, a more comprehensive subcategorization class classifier, and a priori estimates of the probability of membership of these classes. we also describe a small-scale experiment which demonstrates that subcategorization class frequency information for individual verbs can be used to improve parsing accuracy. the experiment and comparison reported above suggest that our more comprehensive subcategorization class extractor is able both to assign classes to individual verbal predicates and also to rank them according to relative frequency with comparable accuracy to extant systems. we have also demonstrated that a subcategorization dictionary built with the system can improve the accuracy of a probabilistic parser by an appreciable amount.
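as a rough illustration of the frequency-ranking step, the following python sketch counts hypothesized (verb, frame) observations and ranks each verb's subcategorization classes by relative frequency. the frame labels and observations are invented stand-ins for the output of a parser/classifier pipeline; the paper's system additionally filters hypotheses with a statistical test against a priori class probabilities.

    # minimal sketch of relative-frequency estimation for subcat classes;
    # the (verb, frame) pairs are toy placeholders for classifier output.
    from collections import Counter, defaultdict

    observations = [
        ("give", "NP_NP"), ("give", "NP_PP_to"), ("give", "NP_PP_to"),
        ("believe", "SCOMP"), ("believe", "NP"),
    ]

    by_verb = defaultdict(Counter)
    for verb, frame in observations:
        by_verb[verb][frame] += 1

    for verb, frames in by_verb.items():
        total = sum(frames.values())
        ranked = frames.most_common()
        print(verb, [(f, round(n / total, 2)) for f, n in ranked])
    # e.g. give [('NP_PP_to', 0.67), ('NP_NP', 0.33)]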
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-1007.txt
ADDED
@@ -0,0 +1 @@
+
moreover, in many cases it is very important not to deviate from certain linguistic standards in generation, in which case hand-crafted grammars give excellent control. however, in other applications for nlg the variety of the output is much bigger, and the demands on the quality of the output somewhat less stringent. a typical example is nlg in the context of (interlingua- or transfer-based) machine translation. another reason for relaxing the quality of the output may be that not enough time is available to develop a full grammar for a new target language in nlg. in all these cases, stochastic ("empiricist") methods provide an alternative to hand-crafted ("rationalist") approaches to nlg. to our knowledge, the first to use stochastic techniques in nlg were langkilde and knight (1998a) and (1998b). in this paper, we present fergus (flexible empiricist/rationalist generation using syntax). fergus follows langkilde and knight's seminal work in using an n-gram language model, but we augment it with a tree-based stochastic model and a traditional tree-based syntactic grammar. more recent work on aspects of stochastic generation includes (langkilde and knight, 2000), (malouf, 1999) and (ratnaparkhi, 2000). before we describe in more detail how we use stochastic models in nlg, we recall the basic tasks in nlg (rambow and korelsky, 1992; reiter, 1994). during text planning, content and structure of the target text are determined to achieve the overall communicative goal. during sentence planning, linguistic means - in particular, lexical and syntactic means - are determined to convey smaller pieces of meaning. during realization, the specification chosen in sentence planning is transformed into a surface string, by linearizing and inflecting words in the sentence (and typically, adding function words). as in the work by langkilde and knight, our work ignores the text planning stage, but it does address the sentence planning and the realization stages. the structure of the paper is as follows. exploiting a probabilistic hierarchical model for generation. srinivas bangalore and owen rambow, at&t labs research, 180 park avenue, florham park, nj 07932.
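a minimal python sketch of the ranking idea: a bigram language model choosing among candidate linearizations. the corpus, smoothing scheme, and candidates are toy inventions, much simpler than fergus's tree-based model plus word-lattice setup.

    # sketch: rank alternative linearizations with an add-alpha smoothed
    # bigram language model; counts and candidates are toy data.
    import math
    from collections import Counter

    corpus = "the dog barked . the cat slept . the dog slept".split()
    bigrams = Counter(zip(corpus, corpus[1:]))
    unigrams = Counter(corpus)

    def score(sentence, alpha=0.1):
        v = len(unigrams)  # vocabulary size for smoothing
        words = sentence.split()
        logp = 0.0
        for w1, w2 in zip(words, words[1:]):
            logp += math.log((bigrams[(w1, w2)] + alpha) /
                             (unigrams[w1] + alpha * v))
        return logp

    candidates = ["the dog slept", "dog the slept"]
    print(max(candidates, key=score))  # -> the dog slept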
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-1044.txt
ADDED
@@ -0,0 +1 @@
+
such features include sense, register, domain specificity, pragmatic restrictions on usage, semantic markedness, and orientation, as well as automatically identified links between words (e.g., semantic relatedness, synonymy, antonymy, and meronymy). automatically learning features of this type from huge corpora allows the construction or augmentation of lexicons, and the assignment of semantic labels to words and phrases in running text. this information in turn can be used to help determine additional features at the lexical, clause, sentence, or document level. this paper explores the benefits that some lexical features of adjectives offer for the prediction of a contextual sentence-level feature, subjectivity. subjectivity in natural language refers to aspects of language used to express opinions and evaluations. the computational task addressed here is to distinguish sentences used to present opinions and other forms of subjectivity (subjective sentences, e.g., "at several different layers, it's a fascinating title") from sentences used to objectively present factual information (objective sentences, e.g., "bell industries inc. increased its quarterly to 10 cents from 7 cents a share"). much research in discourse processing has focused on task-oriented and instructional dialogs. the task addressed here comes to the fore in other genres, especially news reporting and internet forums, in which opinions of various agents are expressed and where subjectivity judgements could help in recognizing inflammatory messages ("flames") and mining online sources for product reviews. other tasks for which subjectivity recognition is potentially very useful include information extraction and information retrieval. assigning subjectivity labels to documents or portions of documents is an example of non-topical characterization of information. current information extraction and retrieval technology focuses almost exclusively on the subject matter of the documents. yet, additional components of a document influence its relevance to particular users or tasks, including, for example, the evidential status of the material presented, and attitudes adopted in favor of or against a particular person, event, or position (e.g., articles on a presidential campaign written to promote a specific candidate). in summarization, subjectivity judgments could be included in document profiles to augment automatically produced document summaries, and to help the user make relevance judgments when using a search engine. other work on subjectivity (wiebe et al., 1999; bruce and wiebe, 2000) has established a positive and statistically significant correlation with the presence of adjectives. effects of adjective orientation and gradability on sentence subjectivity. vasileios hatzivassiloglou, department of computer science, columbia university, new york, ny 10027. janyce m. wiebe, department of computer science, new mexico state university, las cruces, nm 88003.
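as an illustration of using adjective cues for sentence-level subjectivity, here is a toy python scorer. the adjective list is invented; the paper instead learns adjective features (orientation, gradability) automatically from corpora and evaluates them statistically.

    # toy sketch: score a sentence's subjectivity by the density of cue
    # adjectives; the lexicon below is invented for illustration only.
    SUBJECTIVE_ADJ = {"fascinating", "terrible", "wonderful", "dull"}

    def subjectivity_score(sentence):
        tokens = [t.strip(".,!?\"'").lower() for t in sentence.split()]
        hits = sum(1 for t in tokens if t in SUBJECTIVE_ADJ)
        return hits / max(len(tokens), 1)

    for s in ["At several different layers, it's a fascinating title.",
              "Bell Industries Inc. increased its quarterly to 10 cents."]:
        print(round(subjectivity_score(s), 3), s)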
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-1072.txt
ADDED
@@ -0,0 +1 @@
+
topic signatures can be used to identify the presence of a complex concept - a concept that consists of several related components in fixed relationships. restaurant-visit, for example, involves at least the concepts menu, eat, pay, and possibly waiter, and dragon boat festival (in taiwan) involves the concepts calamus (a talisman to ward off evil), moxa (something with the power of preventing pestilence and strengthening health), pictures of chung kuei (a nemesis of evil spirits), eggs standing on end, etc. only when the concepts co-occur is one licensed to infer the complex concept; calamus or moxa alone, for example, are not sufficient. at this time, we do not consider the interrelationships among the concepts. since many texts may describe all the components of a complex concept without ever explicitly mentioning the underlying complex concept - a topic - itself, systems that have to identify topic(s), for summarization or information retrieval, require a method of inferring complex concepts from their component words in the text. 2 related work. in the late 1970s, dejong (dejong, 1982) developed a system called frump (fast reading understanding and memory program) to skim newspaper stories and extract the main details. frump uses a data structure called sketchy script to organize its world knowledge. each sketchy script is what frump knows about what can occur in particular situations such as demonstrations, earthquakes, labor strikes, and so on. frump selects a particular sketchy script based on clues to styled events in news articles. in other words, frump selects an empty template whose slots will be filled on the fly as frump reads a news article. a summary is generated based on what has been captured or filled in the template. the recent success of information extraction research has encouraged the frump approach. the summons (summarizing online news articles) system (mckeown and radev, 1999) takes template outputs of information extraction systems developed for the muc conferences and generates summaries of multiple news articles. frump and summons both rely on prior knowledge of their domains; however, acquiring such prior knowledge is labor-intensive and time-consuming. for example, the university of massachusetts circus system used in the muc-3 (saic, 1998) terrorism domain required about 1500 person-hours to define extraction patterns (riloff, 1996). the automated acquisition of topic signatures for text summarization. chin-yew lin and eduard hovy, information sciences institute, university of southern california, marina del rey, ca 90292, usa. abstract: in order to produce a good summary, one has to identify the most relevant portions of a given text.
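a toy python sketch of extracting signature terms by contrasting on-topic and background frequencies. the scoring here is a simple smoothed log-ratio standing in for the likelihood-ratio statistics typically used for this task; all the data is invented.

    # sketch: rank terms as topic-signature candidates by how much more
    # frequent they are in on-topic text than in background text.
    import math
    from collections import Counter

    topic_text = "menu waiter eat pay menu eat tip menu".split()
    background = "stock market eat rose pay dividend market".split()

    t, b = Counter(topic_text), Counter(background)
    nt, nb = sum(t.values()), sum(b.values())

    def relevance(word, prior=0.5):
        p_topic = (t[word] + prior) / (nt + prior * 2)
        p_bg = (b[word] + prior) / (nb + prior * 2)
        return math.log(p_topic / p_bg)

    vocab = set(t) | set(b)
    signature = sorted(vocab, key=relevance, reverse=True)[:4]
    print(signature)  # high scorers are words characteristic of the topic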
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-2136.txt
ADDED
@@ -0,0 +1 @@
+
we evaluate exdisco by comparing the performance of discovered patterns against that of manually constructed systems on actual extraction tasks. 0 introduction. information extraction is the selective extraction of specified types of information from natural language text. the information to be extracted may consist of particular semantic classes of objects (entities), relationships among these entities, and events in which these entities participate. the extraction system places this information into a database for retrieval and subsequent processing. in this paper we shall be concerned primarily with the extraction of information about events. in the terminology which has evolved from the message understanding conferences (muc, 1995; muc, 1993), we shall use the term subject domain to refer to a broad class of texts, such as business news, and the term scenario to refer to the specification of the particular events to be extracted. for example, the "management succession" scenario for muc-6, which we shall refer to throughout this paper, involves information about corporate executives starting and leaving positions. the fundamental problem we face in porting an extraction system to a new scenario is to identify the many ways in which information about a type of event may be expressed in the text. typically, there will be a few common forms of expression which will quickly come to mind when a system is being developed. however, the beauty of natural language (and the challenge for computational linguists) is that there are many variants which an imaginative writer can use, and which the system needs to capture. finding these variants may involve studying very large amounts of text in the subject domain. this has been a major impediment to the portability and performance of event extraction systems. we present in this paper a new approach to finding these variants automatically from a large corpus, without the need to read or annotate the corpus. this approach has been evaluated on actual event extraction scenarios. in the next section we outline the structure of our extraction system, and describe the discovery task in the context of this system. automatic acquisition of domain knowledge for information extraction. roman yangarber and ralph grishman, courant institute of mathematical sciences, new york university; pasi tapanainen, conexor oy, helsinki, finland.
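the discovery loop can be caricatured in a few lines of python: seed patterns mark documents as relevant, and candidate patterns concentrated in relevant documents are promoted. the scoring below is a bare density ratio and the patterns are invented placeholders, not exdisco's actual metric or pattern representation.

    # toy bootstrapping sketch: alternate between marking relevant documents
    # and promoting patterns dense in them; data and scoring are invented.
    docs = [
        {"hire(company, person)", "appoint(board, person)"},
        {"appoint(board, person)", "resign(person)"},
        {"merge(company, company)"},
        {"resign(person)", "succeed(person, person)"},
    ]
    patterns = {"hire(company, person)"}  # seed

    for _ in range(3):  # a few bootstrapping rounds
        relevant = [d for d in docs if d & patterns]
        candidates = set().union(*docs) - patterns

        def density(p):
            in_rel = sum(1 for d in relevant if p in d)
            in_all = sum(1 for d in docs if p in d)
            return in_rel / in_all

        best = max(candidates, key=density)
        if density(best) >= 0.5:  # promote only well-supported candidates
            patterns.add(best)

    print(sorted(patterns))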
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-2137.txt
ADDED
@@ -0,0 +1 @@
+
when the results are better with the new technique, a question arises as to whether these result differences are due to the new technique actually being better or just due to chance. unfortunately, one usually cannot directly answer the question "what is the probability that the new technique is better given the results on the test data set": p(new technique is better | test set results). but with statistics, one can answer the following proxy question: if the new technique was actually no different than the old technique (the null hypothesis), what is the probability that the results on the test set would be at least this skewed in the new technique's favor (box et al.)? that is, what is p(test set results at least this skewed in the new technique's favor | new technique is no different than the old)? if the probability is small enough (5% is often used as the threshold), then one will reject the null hypothesis and say that the differences in the results are "statistically significant" at that threshold level. this paper examines some of the possible methods for trying to detect statistically significant differences in three commonly used metrics: recall, precision and balanced f-score. many of these methods are found to be problematic in a set of experiments that are performed. these methods have a tendency to underestimate the significance of the results, which tends to make one believe that some new technique is no better than the current technique even when it is. this underestimate comes from these methods assuming that the techniques being compared produce independent results when in our experiments, the techniques being compared tend to produce positively correlated results. to handle this problem, we point out some statistical tests, like the matched-pair t, sign and wilcoxon tests (harnett, 1982, sec. 8.7 and 15.5), which do not make this assumption. one can use these tests on the recall metric, but the precision and balanced f-score metrics have too complex a form for these tests. for such complex metrics, we use a compute-intensive randomization test (cohen, 1995, sec. 5.3), which also avoids this independence assumption. more accurate tests for the statistical significance of result differences. alexander yeh, mitre corp., 202 burlington rd. this paper reports on work performed at the mitre corporation under the support of the mitre sponsored research program. warren greiff, lynette hirschman, christine doran, john henderson, kenneth church, ted dunning, wessel kraaij, mitch marcus and an anonymous reviewer provided helpful suggestions. copyright 2000 the mitre corporation. all rights reserved.
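a minimal python sketch of the paired randomization (approximate permutation) test applied to f-score differences: per-document outcomes of the two systems are randomly swapped, and the p-value is the share of shuffles whose f-score gap is at least the observed one. the per-document counts below are invented toy data.

    # sketch of a paired randomization test on f-score; each item is a
    # (true_positives, system_positives, gold_positives) tuple per document.
    import random

    def f_score(items):
        tp = sum(i[0] for i in items)
        sysp = sum(i[1] for i in items)
        gold = sum(i[2] for i in items)
        p = tp / sysp if sysp else 0.0
        r = tp / gold if gold else 0.0
        return 2 * p * r / (p + r) if p + r else 0.0

    sys_a = [(3, 4, 4), (2, 3, 4), (4, 5, 5), (1, 2, 3)]
    sys_b = [(2, 4, 4), (2, 4, 4), (3, 5, 5), (1, 3, 3)]

    observed = abs(f_score(sys_a) - f_score(sys_b))
    rng, extreme, trials = random.Random(0), 0, 10000
    for _ in range(trials):
        # randomly swap each document's outcomes between the two systems
        pa, pb = zip(*((b, a) if rng.random() < 0.5 else (a, b)
                       for a, b in zip(sys_a, sys_b)))
        if abs(f_score(pa) - f_score(pb)) >= observed:
            extreme += 1
    p_value = (extreme + 1) / (trials + 1)
    print(round(p_value, 4))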
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C00-2163.txt
ADDED
@@ -0,0 +1 @@
+
here f_1^J = f denotes the (french) source and e_1^I = e denotes the (english) target string. most smt models (brown et al., 1993; vogel et al., 1996) try to model word-to-word correspondences between source and target words using an alignment mapping from source position j to target position i = a_j. we can rewrite the probability pr(f_1^J | e_1^I) by introducing the hidden alignments a_1^J := a_1 ... a_j ... a_J (a_j in {0, ..., I}): pr(f_1^J | e_1^I) = sum over a_1^J of pr(f_1^J, a_1^J | e_1^I) = sum over a_1^J of the product over j = 1 ... J of pr(f_j, a_j | f_1^{j-1}, a_1^{j-1}, e_1^I). to allow for french words which do not directly correspond to any english word an artificial empty word e_0 is added to the target sentence at position i = 0. the different alignment models we present provide different decompositions of pr(f_1^J, a_1^J | e_1^I). an alignment a-hat_1^J for which a-hat_1^J = argmax over a_1^J of pr(f_1^J, a_1^J | e_1^I) holds for a specific model is called the viterbi alignment of this model. in this paper we will describe extensions to the hidden-markov alignment model from (vogel et al., 1996) and compare these to models 1 - 4 of (brown et al., 1993). we propose to measure the quality of an alignment model using the quality of the viterbi alignment compared to a manually-produced alignment. this has the advantage that once having produced a reference alignment, the evaluation itself can be performed automatically. in addition, it results in a very precise and reliable evaluation criterion which is well suited to assess various design decisions in modeling and training of statistical alignment models. it is well known that manually performing a word alignment is a complicated and ambiguous task (melamed, 1998). therefore, to produce the reference alignment we use a refined annotation scheme which reduces the complications and ambiguities occurring in the manual construction of a word alignment. as we use the alignment models for machine translation purposes, we also evaluate the resulting translation quality of different models. 2 alignment with hmm. in the hidden-markov alignment model we assume a first-order dependence for the alignments a_j and that the translation probability depends only on a_j and not on a_{j-1}: pr(f_j, a_j | f_1^{j-1}, a_1^{j-1}, e_1^I) = p(a_j | a_{j-1}, I) * p(f_j | e_{a_j}). a comparison of alignment models for statistical machine translation. franz josef och and hermann ney, lehrstuhl für informatik vi, computer science department, rwth aachen - university of technology, d-52056 aachen, germany.
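a toy python sketch of computing the viterbi alignment under an hmm-style model with a first-order transition table p(a_j | a_{j-1}, I) and a translation table p(f_j | e_{a_j}). the probability tables below are invented for illustration; real parameters are trained with em.

    # sketch: viterbi alignment for a toy hmm alignment model.
    import math

    def viterbi_alignment(f_words, e_words, trans, emit):
        I = len(e_words)
        # delta[i] = best log-prob of aligning f_1..j with a_j = i
        delta = [math.log(1.0 / I) + math.log(emit(f_words[0], e_words[i]))
                 for i in range(I)]
        back = []
        for f in f_words[1:]:
            back.append([max(range(I),
                             key=lambda ip: delta[ip] + math.log(trans(ip, i, I)))
                         for i in range(I)])
            delta = [delta[back[-1][i]] + math.log(trans(back[-1][i], i, I))
                     + math.log(emit(f, e_words[i])) for i in range(I)]
        a = [max(range(I), key=lambda i: delta[i])]
        for bp in reversed(back):
            a.append(bp[a[-1]])
        return list(reversed(a))

    emit_table = {("la", "the"): 0.9, ("maison", "house"): 0.8}
    emit = lambda f, e: emit_table.get((f, e), 0.05)      # toy p(f | e)
    trans = lambda ip, i, I: 0.6 if i == ip + 1 else 0.4 / max(I - 1, 1)
    print(viterbi_alignment(["la", "maison"], ["the", "house"], trans, emit))
    # -> [0, 1]: "la" aligns to "the", "maison" to "house"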
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C02-1011.txt
ADDED
@@ -0,0 +1 @@
+
we address here the problem of base np translation, in which for a given base noun phrase in a source language (e.g., "information age" in english), we are to find out its possible translation(s) in a target language (e.g., the corresponding term in chinese). we define a base np as a simple and non-recursive noun phrase. in many cases, base nps represent holistic and non-divisible concepts, and thus accurate translation of them from one language to another is extremely important in applications like machine translation, cross language information retrieval, and foreign language writing assistance. in this paper, we propose a new method for base np translation, which contains two steps: (1) translation candidate collection, and (2) translation selection. in translation candidate collection, for a given base np in the source language, we look for its translation candidates in the target language. to do so, we use a word-to-word translation dictionary and corpus data in the target language on the web. in translation selection, we determine the possible translation(s) from among the candidates. we use non-parallel corpus data in the two languages on the web and employ one of the two methods which we have developed. in the first method, we view the problem as that of classification and employ an ensemble of naive bayesian classifiers constructed with the em algorithm. we will use "em-nbc-ensemble" to denote this method, hereafter. in the second method, we view the problem as that of calculating similarities between context vectors and use tf-idf vectors also constructed with the em algorithm. we will use "em-tf-idf" to denote this method. experimental results indicate that our method is very effective, and the coverage and top 3 accuracy of translation at the final stage are 91.4% and 79.8%, respectively. the results are significantly better than those of the baseline methods relying on existing technologies. the higher performance of our method can be attributed to the enormity of the web data used and the employment of the em algorithm. discriminatively trained taggers, on the other hand, have difficulty handling the huge number of features which are active at the same time if any possible combination of context attributes defines a separate feature. we presented an hmm pos tagger for fine-grained tagsets which splits the pos tags into attribute vectors and estimates the conditional probabilities of the attributes with decision trees. the backoff smoothing methods of traditional n-gram pos taggers require an ordering of the reduced contexts which is not available here. in experiments with german and czech corpora, this method achieved a higher tagging accuracy than two state-of-the-art general-purpose pos taggers (tnt and svmtool). decision trees are ideal for this task because the identification of relevant attribute combinations is at the heart of this method. a hidden-markov-model part-of-speech tagger (brants, 2000, e.g.) computes the most probable pos tag sequence t-hat_1^n = t-hat_1, ..., t-hat_n for a given word sequence w_1^n: t-hat_1^n = argmax over t_1^n of p(t_1^n, w_1^n). the joint probability of the two sequences is defined as the product of context probabilities and lexical probabilities over all pos tags: p(t_1^n, w_1^n) = product over i = 1 ... n of p(t_i | t_{i-k}^{i-1}) * p(w_i | t_i).
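as an illustration of the context-vector flavor of translation selection (the em-tf-idf method sketched earlier in this passage), the following python snippet picks the candidate whose context vector is most similar to the source phrase's mapped context vector. the vectors, candidate names, and weights are invented toy data, not the paper's em-estimated tf-idf vectors.

    # sketch: choose among translation candidates by cosine similarity of
    # context vectors; all counts here are toy placeholders.
    import math

    def cosine(u, v):
        keys = set(u) | set(v)
        dot = sum(u.get(k, 0) * v.get(k, 0) for k in keys)
        nu = math.sqrt(sum(x * x for x in u.values()))
        nv = math.sqrt(sum(x * x for x in v.values()))
        return dot / (nu * nv) if nu and nv else 0.0

    source_context = {"computer": 3, "internet": 2, "digital": 1}  # mapped
    candidates = {
        "candidate_1": {"computer": 2, "internet": 1, "network": 1},
        "candidate_2": {"cooking": 3, "recipe": 2},
    }
    best = max(candidates, key=lambda c: cosine(source_context, candidates[c]))
    print(best)  # -> candidate_1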
drive/MyDrive/RA_Internship/HIPORANK/DATASET_HIPORANK/dataset/inputs/C02-1054.txt
ADDED
@@ -0,0 +1 @@
+
named entity (ne) recognition is a task in which proper nouns and numerical information in a document are detected and classified into categories such as person, organization, and date. it is a key technology of information extraction and open-domain question answering (voorhees and harman, 2000). we are building a trainable open-domain question answering system called saiqa-ii. in this paper, we show that an ne recognizer based on support vector machines (svms) gives better scores than conventional systems. svms have given high performance in various classification tasks (joachims, 1998; kudo and matsumoto, 2001). however, it turned out that off-the-shelf svm classifiers are too inefficient for ne recognition. the recognizer runs at a rate of only 85 bytes/sec on an athlon 1.3 ghz linux pc, while rule-based systems (e.g., isozaki (2001)) can process several kilobytes in a second. the major reason is the inefficiency of svm classifiers. there are other reports on the slowness of svm classifiers. another svm-based ne recognizer (yamada and matsumoto, 2001) is 0.8 sentences/sec on a pentium iii 933 mhz pc. an svm-based part-of-speech (pos) tagger (nakagawa et al, 2001) is 20 tokens/sec on an alpha 21164a 500 mhz processor. it is difficult to use such slow systems in practical applications. in this paper, we present a method that makes the ne system substantially faster. this method can also be applied to other tasks in natural language processing such as chunking and pos tagging. another problem with svms is their incomprehensibility. it is not clear which features are important or how they work. the above method is also useful for finding useless features. we also mention a method to reduce training time. 1.1 support vector machines. suppose we have a set of training data for a two-class problem: (x_1, y_1), ..., (x_m, y_m), where x_i is a feature vector of the i-th sample in the training data and y_i in {+1, -1} is the label for the sample. the goal is to find a decision function that accurately predicts y for unseen x. a non-linear svm classifier gives a decision function f(x) = sign(g(x)) for an input vector x, where g(x) = sum over i = 1 ... m of w_i k(x_i, x) + b. here, f(x) = +1 means x is a member of a certain class and f(x) = -1 means x is not a member. the x_i are called support vectors and are representatives of training examples; m is the number of support vectors. therefore, the computational complexity of g(x) is proportional to m. support vectors and other constants are determined by solving a certain quadratic programming problem. k(x_i, x) is a kernel that implicitly maps vectors into a higher dimensional space. typical kernels use dot products: k(x_i, x) = k(x_i . x). a polynomial kernel of degree d is given by k(z) = (1 + z)^d. we can use various kernels, and the design of an appropriate kernel for a particular application is an important research issue. [figure 1: support vector machine - positive and negative examples separated by the maximum-margin decision hyperplane; the circled examples are support vectors.] figure 1 shows a linearly separable case. the decision hyperplane defined by g(x) = 0 separates positive and negative examples by the largest margin. the solid line indicates the decision hyperplane and two parallel dotted lines indicate the margin between positive and negative examples. since such a separating hyperplane may not exist, a positive parameter c is introduced to allow misclassifications. see vapnik (1995). 1.2 svm-based ne recognition.
as far as we know, the first svm-based ne system was proposed by yamada et al (2001) for japanese. his system is an extension of kudo's chunking system (kudo and matsumoto, 2001) that gave the best performance at conll-2000 shared tasks. in their system, every word in a sentence is classified sequentially from the beginning or the end of a sentence. however, since yamada has not compared it with other methods under the same conditions, it is not clear whether his ne system is better or not. here, we show that our svm-based ne system is more accurate than conventional systems. our system uses the viterbi search (allen, 1995) instead of sequential determination. for training, we use "crl data", which was prepared for irex (information retrieval and extraction exercise, http://cs.nyu.edu/cs/projects/proteus/irex, sekine and eriguchi (2000)). it has about 19,000 nes in 1,174 articles. we also use additional data by isozaki (2001). both datasets are based on mainichi newspaper's 1994 and 1995 cd-roms. we use irex's formal test data called general that has 1,510 named entities in 71 articles from mainichi newspaper of 1999. systems are compared in terms of general's f-measure, which is the harmonic mean of "recall" and "precision" and is defined as follows: recall = m/(the number of correct nes), precision = m/(the number of nes extracted by a system), where m is the number of nes correctly extracted and classified by the system. we developed an svm-based ne system by following our ne system based on maximum entropy (me) modeling (isozaki, 2001). we simply replaced the me model with svm classifiers. the above datasets are processed by a morphological analyzer chasen 2.2.1 (http://chasen.aist-nara.ac.jp/). it tokenizes a sentence into words and adds pos tags. chasen uses about 90 pos tags such as common-noun and location-name. since most unknown words are proper nouns, chasen's parameters for unknown words are modified for better results. then, a character type tag is added to each word. it uses 17 character types such as all-kanji and small-integer. see isozaki (2001) for details. now, japanese ne recognition is solved by the classification of words (sekine et al, 1998; borthwick, 1999; uchimoto et al, 2000). for instance, the words in "president george herbert bush said clinton is ..." are classified as follows: "president" = other, "george" = person-begin, "herbert" = person-middle, "bush" = person-end, "said" = other, "clinton" = person-single, "is" = other. in this way, the first word of a person's name is labeled as person-begin. the last word is labeled as person-end. other words in the name are person-middle. if a person's name is expressed by a single word, it is labeled as person-single. if a word does not belong to any named entities, it is labeled as other. since irex defines eight ne classes, words are classified into 33 (= 8 x 4 + 1) categories. each sample is represented by 15 features because each word has three features (part-of-speech tag, character type, and the word itself), and two preceding words and two succeeding words are also used for context dependence. although infrequent features are usually removed to prevent overfitting, we use all features because svms are robust. each sample is represented by a long binary vector, i.e., a sequence of 0 (false) and 1 (true). for instance, "bush" in the above example is represented by a sparse binary vector x = (x_1, ..., x_D) in which only 15 elements are 1, e.g.: x_a = 0 (current word is not "alice"), x_b = 1 (current word is "bush"),
x_c = 0 (current word is not "charlie"), ..., x_d = 1 (current pos is a proper noun), x_e = 0 (current pos is not a verb), ..., x_f = 0 (previous word is not "henry"), x_g = 1 (previous word is "herbert"). here, we have to consider the following problems. first, svms can solve only a two-class problem. therefore, we have to reduce the above multi-class problem to a group of two-class problems. second, we have to consider consistency among word classes in a sentence. for instance, a word classified as person-begin should be followed by person-middle or person-end. it implies that the system has to determine the best combinations of word classes from numerous possibilities. here, we solve these problems by combining existing methods. there are a few approaches to extend svms to cover k-class problems. here, we employ the "one class versus all others" approach. that is, each classifier f_c(x) is trained to distinguish members of a class c from non-members. in this method, two or more classifiers may give +1 to an unseen vector, or no classifier may give +1. one common way to avoid such situations is to compare the g_c(x) values and to choose the class index c of the largest g_c(x). the consistency problem is solved by the viterbi search. since svms do not output probabilities, we use the svm+sigmoid method (platt, 2000). that is, we use a sigmoid function s(g) = 1/(1 + exp(-beta g)) to map g_c(x) to a probability-like value. the output of the viterbi search is adjusted by a postprocessor for wrong word boundaries. the adjustment rules are also statistically determined (isozaki, 2001). 1.3 comparison of ne recognizers. we use a fixed value c = 0.1. f-measures are not very sensitive to c unless c is too small. when we used 1,038,986 training vectors, general's f-measure was 89.64% for c = 0.01 and 90.03% for c = 0.1. we employ the quadratic kernel (d = 2) because it gives the best results. polynomial kernels of degree 1, 2, and 3 resulted in 83.03%, 88.31%, and 87.04% respectively when we used 569,994 training vectors. [figure 2: f-measures of the ne systems (rg+dt, me, svm) plotted against the number of nes in the training data.] figure 2 compares ne recognizers in terms of general's f-measures. "svm" in the figure indicates f-measures of our system trained by kudo's tinysvm-0.07. it attained 85.04% when we used only crl data. "me" indicates our me system and "rg+dt" indicates a rule-based machine learning system (isozaki, 2001). according to this graph, "svm" is better than the other systems. however, svm classifiers are too slow. the famous svm-light 3.50 (joachims, 1999) took 1.2 days to classify 569,994 vectors derived from 2 mb documents. that is, it runs at only 19 bytes/sec. tinysvm's classifier seems best optimized among publicly available svm toolkits, but it still works at only 92 bytes/sec.
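as a concrete rendering of the 15-feature window encoding described just above, here is a small python sketch that builds the sparse binary vector for one token from word, pos, and character-type features in a two-word window. the pos and character-type values are invented placeholders, not chasen's actual tag inventory.

    # sketch: build the sparse binary feature vector for one token from a
    # +-2 word window; token annotations below are toy placeholders.
    def window_features(tokens, i, width=2):
        feats = []
        for off in range(-width, width + 1):
            if 0 <= i + off < len(tokens):
                word, pos, ctype = tokens[i + off]
                feats += [f"{off}:word={word}", f"{off}:pos={pos}",
                          f"{off}:ctype={ctype}"]
        return feats

    tokens = [("president", "NOUN", "lower"), ("george", "PROPN", "lower"),
              ("herbert", "PROPN", "lower"), ("bush", "PROPN", "lower"),
              ("said", "VERB", "lower"), ("clinton", "PROPN", "lower")]
    feats = window_features(tokens, 3)             # features for "bush"
    index = {f: j for j, f in enumerate(sorted(set(feats)))}
    vector = [0] * len(index)
    for f in feats:
        vector[index[f]] = 1                       # 15 active dimensions
    print(len(feats), sum(vector))                 # -> 15 15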
our svm-based ne recognizer attained f = 90.03% on this test set. we also thank shigeru katagiri and ken-ichiro ishii for their support.
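finally, a self-contained python sketch of the classification machinery described in this passage: a polynomial-kernel decision function g_c(x) per class, one-vs-rest selection of the class with the largest score, and a platt-style sigmoid mapping scores to probability-like values. the support vectors, weights, and class names are invented stand-ins, not trained parameters.

    # sketch: one-vs-rest classification with a polynomial-kernel svm
    # decision function and a sigmoid squashing of scores.
    import math

    def poly_kernel(x, z, d=2):
        return (1 + sum(a * b for a, b in zip(x, z))) ** d

    def g(x, svs, weights, b):
        return sum(w * poly_kernel(sv, x) for sv, w in zip(svs, weights)) + b

    def sigmoid(score, beta=1.0):
        return 1.0 / (1.0 + math.exp(-beta * score))

    classifiers = {  # class -> (support vectors, weights, bias): toy numbers
        "PERSON-BEGIN": ([(1, 0, 1), (0, 1, 0)], [0.7, -0.3], -0.2),
        "OTHER":        ([(0, 1, 1), (1, 1, 0)], [0.5, -0.6], 0.1),
    }

    x = (1, 0, 1)
    scores = {c: g(x, *params) for c, params in classifiers.items()}
    best = max(scores, key=scores.get)  # one-vs-rest: pick largest g_c(x)
    print(best, {c: round(sigmoid(s), 3) for c, s in scores.items()})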