Spaces:
Runtime error
Runtime error
Commit
•
fdc091a
1
Parent(s):
8ba4837
Adding html and topic modeling on subreddit
Browse files
- requirements.txt +2 -1
- src/build_nomic.py +29 -5
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ tqdm==4.66.1
|
|
7 |
beautifulsoup4==4.12.2
|
8 |
lxml==4.9.3
|
9 |
rich==13.3.4
|
10 |
-
nomic==3.0.15
|
|
|
|
7 |
beautifulsoup4==4.12.2
|
8 |
lxml==4.9.3
|
9 |
rich==13.3.4
|
10 |
+
nomic==3.0.15
|
11 |
+
markdown==3.6
|
src/build_nomic.py
CHANGED
@@ -1,12 +1,15 @@
|
|
1 |
# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
|
2 |
import os
|
3 |
-
import
|
4 |
import time
|
5 |
|
|
|
6 |
import nomic
|
7 |
-
from nomic import atlas
|
8 |
-
from nomic.dataset import AtlasClass
|
9 |
import numpy as np
|
|
|
|
|
|
|
|
|
10 |
|
11 |
from src.my_logger import setup_logger
|
12 |
|
@@ -20,6 +23,11 @@ def count_words(text):
|
|
20 |
return len(words)
|
21 |
|
22 |
|
|
|
|
|
|
|
|
|
|
|
23 |
def delete_old_nomic():
|
24 |
logger.info(f"Trying to delete old version of nomic Atlas...")
|
25 |
try:
|
@@ -32,11 +40,12 @@ def delete_old_nomic():
|
|
32 |
except:
|
33 |
logger.info(f"Failed to delete old version of nomic Atlas.")
|
34 |
|
|
|
35 |
def build_nomic(dataset):
|
36 |
df = dataset['train'].to_pandas()
|
37 |
|
38 |
-
non_embedding_columns = ['date_utc', 'title', 'flair', '
|
39 |
-
'score', 'score_percentile']
|
40 |
|
41 |
# Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
|
42 |
percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
|
@@ -53,6 +62,20 @@ def build_nomic(dataset):
|
|
53 |
df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
|
54 |
|
55 |
df['word_count'] = df['content'].apply(count_words)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
delete_old_nomic()
|
58 |
|
@@ -62,5 +85,6 @@ def build_nomic(dataset):
|
|
62 |
data=df[non_embedding_columns].to_dict(orient='records'),
|
63 |
id_field='id',
|
64 |
identifier='BORU Subreddit Neural Search',
|
|
|
65 |
)
|
66 |
logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")
|
|
|
1 |
# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
|
2 |
import os
|
3 |
+
import re
|
4 |
import time
|
5 |
|
6 |
+
import markdown
|
7 |
import nomic
|
|
|
|
|
8 |
import numpy as np
|
9 |
+
import pandas as pd
|
10 |
+
from nomic import atlas, Nomic
|
11 |
+
from nomic.dataset import AtlasClass
|
12 |
+
from nomic.data_inference import NomicTopicOptions
|
13 |
|
14 |
from src.my_logger import setup_logger
|
15 |
|
|
|
23 |
return len(words)
|
24 |
|
25 |
|
26 |
+
def convert_markdown_to_html(markdown_text):
|
27 |
+
html = markdown.markdown(markdown_text)
|
28 |
+
return html
|
29 |
+
|
30 |
+
|
31 |
def delete_old_nomic():
|
32 |
logger.info(f"Trying to delete old version of nomic Atlas...")
|
33 |
try:
|
|
|
40 |
except:
|
41 |
logger.info(f"Failed to delete old version of nomic Atlas.")
|
42 |
|
43 |
+
|
44 |
def build_nomic(dataset):
|
45 |
df = dataset['train'].to_pandas()
|
46 |
|
47 |
+
non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'permalink', 'id', 'word_count',
|
48 |
+
'score', 'score_percentile', 'html_content', 'subreddit']
|
49 |
|
50 |
# Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
|
51 |
percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
|
|
|
62 |
df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
|
63 |
|
64 |
df['word_count'] = df['content'].apply(count_words)
|
65 |
+
df['html_content'] = df['content'].apply(convert_markdown_to_html)
|
66 |
+
|
67 |
+
# Regex to extract subreddit
|
68 |
+
subreddit_re = re.compile(r'r/(\w+)')
|
69 |
+
def extract_subreddit(text):
|
70 |
+
match = subreddit_re.search(text)
|
71 |
+
if match:
|
72 |
+
return match.group(1)
|
73 |
+
return ''
|
74 |
+
|
75 |
+
# Apply the function
|
76 |
+
df['subreddit'] = df['content'].apply(extract_subreddit)
|
77 |
+
|
78 |
+
topic_options = NomicTopicOptions(build_topic_model=True, community_description_target_field='subreddit')
|
79 |
|
80 |
delete_old_nomic()
|
81 |
|
|
|
85 |
data=df[non_embedding_columns].to_dict(orient='records'),
|
86 |
id_field='id',
|
87 |
identifier='BORU Subreddit Neural Search',
|
88 |
+
topic_model=topic_options
|
89 |
)
|
90 |
logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")
|