Clean duplicates in user history

#159
by Wauplin HF staff - opened
Files changed (1) hide show
  1. user_history.py +81 -112
user_history.py CHANGED
@@ -15,6 +15,7 @@ Useful links:
15
  - Source file: https://huggingface.co/spaces/Wauplin/gradio-user-history/blob/main/user_history.py
16
  - Discussions: https://huggingface.co/spaces/Wauplin/gradio-user-history/discussions
17
  """
 
18
  import json
19
  import os
20
  import shutil
@@ -37,8 +38,11 @@ def setup(folder_path: str | Path | None = None) -> None:
37
  user_history.folder_path = _resolve_folder_path(folder_path)
38
  user_history.initialized = True
39
 
40
- # TODO: remove this section once all Spaces have migrated
41
- _migrate_history()
 
 
 
42
 
43
 
44
  def render() -> None:
@@ -46,9 +50,7 @@ def render() -> None:
46
 
47
  # initialize with default config
48
  if not user_history.initialized:
49
- print(
50
- "Initializing user history with default config. Use `user_history.setup(...)` to customize folder_path."
51
- )
52
  setup()
53
 
54
  # Render user history tab
@@ -83,18 +85,11 @@ def render() -> None:
83
 
84
  # "Export zip" row (hidden by default)
85
  with gr.Row():
86
- export_file = gr.File(
87
- file_count="single",
88
- file_types=[".zip"],
89
- label="Exported history",
90
- visible=False,
91
- )
92
 
93
  # "Config deletion" row (hidden by default)
94
  with gr.Row():
95
- confirm_button = gr.Button(
96
- "Confirm delete all history", variant="stop", visible=False
97
- )
98
  cancel_button = gr.Button("Cancel", visible=False)
99
 
100
  # Gallery
@@ -117,12 +112,8 @@ def render() -> None:
117
  gallery.attach_load_event(_fetch_user_history, every=None)
118
 
119
  # Interactions
120
- refresh_button.click(
121
- fn=_fetch_user_history, inputs=[], outputs=[gallery], queue=False
122
- )
123
- export_button.click(
124
- fn=_export_user_history, inputs=[], outputs=[export_file], queue=False
125
- )
126
 
127
  # Taken from https://github.com/gradio-app/gradio/issues/3324#issuecomment-1446382045
128
  delete_button.click(
@@ -203,9 +194,7 @@ class _UserHistory(object):
203
 
204
  def _user_lock(self, username: str) -> FileLock:
205
  """Ensure history is not corrupted if concurrent calls."""
206
- return FileLock(
207
- self.folder_path / f"{username}.lock"
208
- ) # lock outside of folder => better when exporting ZIP
209
 
210
  def _user_jsonl_path(self, username: str) -> Path:
211
  return self._user_path(username) / "history.jsonl"
@@ -225,9 +214,7 @@ def _fetch_user_history(profile: gr.OAuthProfile | None) -> List[Tuple[str, str]
225
 
226
  user_history = _UserHistory()
227
  if not user_history.initialized:
228
- warnings.warn(
229
- "User history is not set in Gradio demo. You must use `user_history.render(...)` first."
230
- )
231
  return []
232
 
233
  with user_history._user_lock(username):
@@ -253,17 +240,13 @@ def _export_user_history(profile: gr.OAuthProfile | None) -> Dict | None:
253
 
254
  user_history = _UserHistory()
255
  if not user_history.initialized:
256
- warnings.warn(
257
- "User history is not set in Gradio demo. You must use `user_history.render(...)` first."
258
- )
259
  return None
260
 
261
  # Zip history
262
  with user_history._user_lock(username):
263
  path = shutil.make_archive(
264
- str(_archives_path() / f"history_{username}"),
265
- "zip",
266
- user_history._user_path(username),
267
  )
268
 
269
  return gr.update(visible=True, value=path)
@@ -278,9 +261,7 @@ def _delete_user_history(profile: gr.OAuthProfile | None) -> None:
278
 
279
  user_history = _UserHistory()
280
  if not user_history.initialized:
281
- warnings.warn(
282
- "User history is not set in Gradio demo. You must use `user_history.render(...)` first."
283
- )
284
  return
285
 
286
  with user_history._user_lock(username):
@@ -317,9 +298,7 @@ def _resolve_folder_path(folder_path: str | Path | None) -> Path:
317
  if folder_path is not None:
318
  return Path(folder_path).expanduser().resolve()
319
 
320
- if os.getenv("SYSTEM") == "spaces" and os.path.exists(
321
- "/data"
322
- ): # Persistent storage is enabled!
323
  return Path("/data") / "_user_history"
324
 
325
  # Not in a Space or Persistent storage not enabled => local folder
@@ -380,10 +359,8 @@ def _get_nb_users() -> int:
380
  user_history = _UserHistory()
381
  if not user_history.initialized:
382
  return 0
383
- if user_history.folder_path is not None:
384
- return len(
385
- [path for path in user_history.folder_path.iterdir() if path.is_dir()]
386
- )
387
  return 0
388
 
389
 
@@ -391,7 +368,7 @@ def _get_nb_images() -> int:
391
  user_history = _UserHistory()
392
  if not user_history.initialized:
393
  return 0
394
- if user_history.folder_path is not None:
395
  return len([path for path in user_history.folder_path.glob("*/images/*")])
396
  return 0
397
 
@@ -425,14 +402,10 @@ def _disk_space_warning_message() -> str:
425
 
426
 
427
  def _get_disk_usage(path: Path) -> Tuple[int, int, int]:
428
- for path in [path] + list(
429
- path.parents
430
- ): # first check target_dir, then each parents one by one
431
  try:
432
  return shutil.disk_usage(path)
433
- except (
434
- OSError
435
- ): # if doesn't exist or can't read => fail silently and try parent one
436
  pass
437
  return 0, 0, 0
438
 
@@ -451,74 +424,70 @@ def _fetch_admins() -> List[str]:
451
  # Running in Space => try to fetch organization members
452
  # Otherwise, it's not an organization => namespace is the user
453
  namespace = space_id.split("/")[0]
454
- response = requests.get(
455
- f"https://huggingface.co/api/organizations/{namespace}/members"
456
- )
457
  if response.status_code == 200:
458
- return sorted(
459
- (member["user"] for member in response.json()), key=lambda x: x.lower()
460
- )
461
  return [namespace]
462
 
463
 
464
- ################################################################
465
- # Legacy helpers to migrate image structure to new data format #
466
- ################################################################
467
- # TODO: remove this section once all Spaces have migrated
468
 
469
-
470
- def _migrate_history():
471
- """Script to migrate user history from v0 to v1."""
472
- legacy_history_path = _legacy_get_history_folder_path()
473
- if not legacy_history_path.exists():
474
- return
475
-
476
- error_count = 0
477
- for json_path in legacy_history_path.glob("*.json"):
478
- username = json_path.stem
479
- print(f"Migrating history for user {username}...")
480
- error_count += _legacy_move_user_history(username)
481
- print("Done.")
482
- print(f"Migration complete. {error_count} error(s) happened.")
483
-
484
- if error_count == 0:
485
- shutil.rmtree(legacy_history_path, ignore_errors=True)
486
-
487
-
488
- def _legacy_move_user_history(username: str) -> int:
489
- history = _legacy_read_user_history(username)
490
- error_count = 0
491
- for image, prompt in reversed(history):
492
- try:
493
- save_image(
494
- label=prompt, image=image, profile={"preferred_username": username}
495
- )
496
- except Exception as e:
497
- print("Issue while migrating image:", e)
498
- error_count += 1
499
- return error_count
500
 
501
 
502
- def _legacy_get_history_folder_path() -> Path:
503
- _folder = os.environ.get("HISTORY_FOLDER")
504
- if _folder is None:
505
- _folder = Path(__file__).parent / "history"
506
- return Path(_folder)
507
-
508
-
509
- def _legacy_read_user_history(username: str) -> List[Tuple[str, str]]:
510
- """Return saved history for that user."""
511
- with _legacy_user_lock(username):
512
- path = _legacy_user_history_path(username)
513
- if path.exists():
514
- return json.loads(path.read_text())
515
- return [] # No history yet
516
-
517
-
518
- def _legacy_user_history_path(username: str) -> Path:
519
- return _legacy_get_history_folder_path() / f"{username}.json"
520
-
521
 
522
- def _legacy_user_lock(username: str) -> FileLock:
523
- """Ensure history is not corrupted if concurrent calls."""
524
- return FileLock(f"{_legacy_user_history_path(username)}.lock")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  - Source file: https://huggingface.co/spaces/Wauplin/gradio-user-history/blob/main/user_history.py
16
  - Discussions: https://huggingface.co/spaces/Wauplin/gradio-user-history/discussions
17
  """
18
+ import hashlib
19
  import json
20
  import os
21
  import shutil
 
38
  user_history.folder_path = _resolve_folder_path(folder_path)
39
  user_history.initialized = True
40
 
41
+ # Clean duplicates
42
+ try:
43
+ _clean_duplicates()
44
+ except Exception as e:
45
+ print(f"Failed to clean duplicates: {e}")
46
 
47
 
48
  def render() -> None:
 
50
 
51
  # initialize with default config
52
  if not user_history.initialized:
53
+ print("Initializing user history with default config. Use `user_history.setup(...)` to customize folder_path.")
 
 
54
  setup()
55
 
56
  # Render user history tab
 
85
 
86
  # "Export zip" row (hidden by default)
87
  with gr.Row():
88
+ export_file = gr.File(file_count="single", file_types=[".zip"], label="Exported history", visible=False)
 
 
 
 
 
89
 
90
  # "Config deletion" row (hidden by default)
91
  with gr.Row():
92
+ confirm_button = gr.Button("Confirm delete all history", variant="stop", visible=False)
 
 
93
  cancel_button = gr.Button("Cancel", visible=False)
94
 
95
  # Gallery
 
112
  gallery.attach_load_event(_fetch_user_history, every=None)
113
 
114
  # Interactions
115
+ refresh_button.click(fn=_fetch_user_history, inputs=[], outputs=[gallery], queue=False)
116
+ export_button.click(fn=_export_user_history, inputs=[], outputs=[export_file], queue=False)
 
 
 
 
117
 
118
  # Taken from https://github.com/gradio-app/gradio/issues/3324#issuecomment-1446382045
119
  delete_button.click(
 
194
 
195
  def _user_lock(self, username: str) -> FileLock:
196
  """Ensure history is not corrupted if concurrent calls."""
197
+ return FileLock(self.folder_path / f"{username}.lock") # lock outside of folder => better when exporting ZIP
 
 
198
 
199
  def _user_jsonl_path(self, username: str) -> Path:
200
  return self._user_path(username) / "history.jsonl"
 
214
 
215
  user_history = _UserHistory()
216
  if not user_history.initialized:
217
+ warnings.warn("User history is not set in Gradio demo. You must use `user_history.render(...)` first.")
 
 
218
  return []
219
 
220
  with user_history._user_lock(username):
 
240
 
241
  user_history = _UserHistory()
242
  if not user_history.initialized:
243
+ warnings.warn("User history is not set in Gradio demo. You must use `user_history.render(...)` first.")
 
 
244
  return None
245
 
246
  # Zip history
247
  with user_history._user_lock(username):
248
  path = shutil.make_archive(
249
+ str(_archives_path() / f"history_{username}"), "zip", user_history._user_path(username)
 
 
250
  )
251
 
252
  return gr.update(visible=True, value=path)
 
261
 
262
  user_history = _UserHistory()
263
  if not user_history.initialized:
264
+ warnings.warn("User history is not set in Gradio demo. You must use `user_history.render(...)` first.")
 
 
265
  return
266
 
267
  with user_history._user_lock(username):
 
298
  if folder_path is not None:
299
  return Path(folder_path).expanduser().resolve()
300
 
301
+ if os.getenv("SYSTEM") == "spaces" and os.path.exists("/data"): # Persistent storage is enabled!
 
 
302
  return Path("/data") / "_user_history"
303
 
304
  # Not in a Space or Persistent storage not enabled => local folder
 
359
  user_history = _UserHistory()
360
  if not user_history.initialized:
361
  return 0
362
+ if user_history.folder_path is not None and user_history.folder_path.exists():
363
+ return len([path for path in user_history.folder_path.iterdir() if path.is_dir()])
 
 
364
  return 0
365
 
366
 
 
368
  user_history = _UserHistory()
369
  if not user_history.initialized:
370
  return 0
371
+ if user_history.folder_path is not None and user_history.folder_path.exists():
372
  return len([path for path in user_history.folder_path.glob("*/images/*")])
373
  return 0
374
 
 
402
 
403
 
404
  def _get_disk_usage(path: Path) -> Tuple[int, int, int]:
405
+ for path in [path] + list(path.parents): # first check target_dir, then each parents one by one
 
 
406
  try:
407
  return shutil.disk_usage(path)
408
+ except OSError: # if doesn't exist or can't read => fail silently and try parent one
 
 
409
  pass
410
  return 0, 0, 0
411
 
 
424
  # Running in Space => try to fetch organization members
425
  # Otherwise, it's not an organization => namespace is the user
426
  namespace = space_id.split("/")[0]
427
+ response = requests.get(f"https://huggingface.co/api/organizations/{namespace}/members")
 
 
428
  if response.status_code == 200:
429
+ return sorted((member["user"] for member in response.json()), key=lambda x: x.lower())
 
 
430
  return [namespace]
431
 
432
 
433
+ #######
434
+ #######
 
 
435
 
436
+ # TODO: remove this from IllusionDiffusion once cleaned
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
 
438
 
439
+ def _clean_duplicates() -> None:
440
+ user_history = _UserHistory()
441
+ if not (user_history.initialized and user_history.folder_path.exists()):
442
+ # Must be initialized correctly
443
+ return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
 
445
+ _lock = user_history.folder_path / "_clean_duplicates.lock"
446
+ _is_done_file = user_history.folder_path / "_clean_duplicates_is_done" # Only 1 replica will do it, once for all
447
+
448
+ with FileLock(_lock):
449
+ if _is_done_file.exists(): # if True, another replica already did it
450
+ return
451
+
452
+ for subpath in user_history.folder_path.iterdir():
453
+ if subpath.is_file():
454
+ continue
455
+
456
+ history_file = subpath / "history.jsonl"
457
+ if not history_file.exists():
458
+ continue
459
+
460
+ # Read history
461
+ images = [json.loads(line) for line in history_file.read_text().splitlines()]
462
+
463
+ # Select unique images
464
+ curated_images = []
465
+ seen_hashes = set()
466
+ seen_paths = set()
467
+ for image in images:
468
+ image_hash = _file_hash(Path(image["path"]))
469
+ if image_hash is None:
470
+ continue
471
+ if image_hash in seen_hashes:
472
+ continue
473
+ seen_hashes.add(image_hash)
474
+ seen_paths.add(Path(image["path"]))
475
+ curated_images.append(image)
476
+
477
+ # Remove duplicates + save history
478
+ for path in subpath.glob("images/*"):
479
+ if path not in seen_paths:
480
+ try:
481
+ path.unlink()
482
+ except OSError:
483
+ pass
484
+ history_file.write_text("\n".join(json.dumps(image) for image in curated_images))
485
+
486
+ _is_done_file.touch()
487
+
488
+
489
+ def _file_hash(path: Path) -> str | None:
490
+ """Return the hash of a file. No need to read by chunks."""
491
+ if path.is_file():
492
+ return hashlib.md5(path.read_bytes()).hexdigest()
493
+ return None