philipp-zettl committed on
Commit
5e38ce5
1 Parent(s): ca98a40

Update src/text.py

Browse files
Files changed (1) hide show
  1. src/text.py +5 -1
src/text.py CHANGED
@@ -100,7 +100,11 @@ def doctree_from_url(url, elem_class='div', class_name='article-body'):
100
  article = extract_article(url)
101
  # convert to MD to handle splitting better
102
  article_content = select_content(article.html, elem_class, class_name)
103
- article_content = (f"# {article.title}\n\n" + article_content).replace('\n\n', '\n').replace('#', '%%@@%%')
 
 
 
 
104
  # fix relative website links
105
  article_content = fix_relative_links(url, article_content)
106
  # convert back to HTML
 
100
  article = extract_article(url)
101
  # convert to MD to handle splitting better
102
  article_content = select_content(article.html, elem_class, class_name)
103
+ requires_title = not any(filter(lambda x: x.startswith('# '), article_content.split('\n')))
104
+
105
+ if requires_title:
106
+ article_content = f"# {article.title}\n\n{article_content}"
107
+ article_content = article_content.replace('\n\n', '\n').replace('#', '%%@@%%')
108
  # fix relative website links
109
  article_content = fix_relative_links(url, article_content)
110
  # convert back to HTML