# blob: a606277b06e83393dc2c9214e38dc3a1fb101139 [file] [log] [blame]
import re
import requests
import urllib.parse
import unittest
from unittest.mock import patch
#base URL prepended to site-relative links (link targets that start with "/")
RELATIVE_LINK_HEAD = "https://cldr.unicode.org"
#sometimes the html --> md conversion puts extra spaces between bullets
def fixBullets(content):
    """Tidy list formatting left over from the html --> md conversion.

    Three things happen, in order:

    1. ``-`` followed by three whitespace characters becomes ``- ``.
    2. ``N.`` followed by two whitespace characters becomes ``N. ``.
    3. Blank lines that directly follow a list item are dropped, so the
       items of one list end up on consecutive lines.

    Args:
        content: markdown text to clean.

    Returns:
        The cleaned markdown. Lines are joined with a newline, and a line
        break at the very end of the input is kept as one trailing newline.
    """
    # [^\S\r\n] means "whitespace except line breaks": the collapse has to
    # stay on one line, otherwise a paragraph ending in "3." followed by a
    # blank line would be merged with the next paragraph
    content = re.sub(r'-[^\S\r\n]{3}', '- ', content)
    content = re.sub(r'(\d+\.)[^\S\r\n]{2}', r'\1 ', content)

    # a list item is a "-" or "N." marker followed by whitespace (or nothing);
    # this keeps "---" rules and prose such as "2023 was ..." out of list mode
    list_item = re.compile(r'^\s*(?:-|\d+\.)(?:\s|$)')

    processed_lines = []
    in_list = False
    for line in content.splitlines():
        if list_item.match(line):
            in_list = True
        elif in_list and not line.strip():
            #skip empty lines within lists
            continue
        else:
            in_list = False
        processed_lines.append(line)

    processed_content = '\n'.join(processed_lines)
    # splitlines() discards the final line break; put it back
    if content.endswith(('\n', '\r')):
        processed_content += '\n'
    return processed_content
#html-->md conversion puts link headings into md and messes up titles
def fixTitles(content):
    """Put link-style headings back on a single line.

    The html --> md conversion emits a heading as a run of ``#``, an empty
    anchor link, and then the title on the following line, for example::

        ##
        [
        ](#h.abc123)
        Title text

    Each such group is rewritten as ``## Title text``.

    Args:
        content: markdown text to clean.

    Returns:
        The markdown with link headings collapsed. Line breaks that follow
        the title line are left in place so the heading stays separated from
        whatever comes after it.
    """
    # the pattern stops at the end of the title line: consuming the newlines
    # after it (without re-emitting them) would glue the next line onto the
    # rebuilt heading
    pattern = re.compile(r'(#+)\s*\n*\[\n*\]\(#.*\)\n(.*)')

    def replaceUnwanted(match):
        heading_level = match.group(1)  #heading level (ex. ##)
        title_text = match.group(2).strip()  #title without surrounding spaces
        return f"{heading_level} {title_text}"

    return pattern.sub(replaceUnwanted, content)
# add title at top and unicode copyright at bottom
def addHeaderAndFooter(content):
    """Add YAML front matter with the page title and the copyright footer.

    The title is the text of the first markdown heading line that has text;
    "Default Title" is used when there is none. The header is skipped when
    the content already starts with a front matter block that has a
    ``title:`` key, and the footer is skipped when the copyright image is
    already present anywhere in the content, so calling this twice is a
    no-op the second time.

    Args:
        content: markdown text.

    Returns:
        The markdown with header and footer added where they were missing.
    """
    # anchoring at the line start keeps a "#" inside prose (e.g. "C# ...")
    # from being read as the title; \S skips headings with no text
    title_match = re.search(r'^#+[ \t]+(\S.*)$', content, re.MULTILINE)
    title = title_match.group(1).strip() if title_match else "Default Title"

    # NOTE(review): the title is inserted verbatim; it is not YAML-quoted
    header = f"---\ntitle: {title}\n---\n"
    footer = "\n![Unicode copyright](https://www.unicode.org/img/hb_notice.gif)\n"

    # front matter only counts at the very top, hence match() without
    # MULTILINE; the (?!---) guards stop the scan at the closing fence so a
    # "title:" line further down the body is not mistaken for front matter
    title_exists = re.match(
        r'---[ \t]*\r?\n(?:(?!---).*\r?\n)*?title:.*\r?\n(?:(?!---).*\r?\n)*?---',
        content,
    )
    #look for an existing copyright image anywhere in the content
    footer_exists = footer.strip() in content

    if not title_exists:
        content = header + content
    if not footer_exists:
        content = content + footer
    return content
#html-->md sometimes produces double bullets on indented lists
def fixIndentedBullets(content):
    """Flatten lists whose first item came out as a double bullet ("- - x").

    A line starting with "- - " is rewritten as a single "- " bullet. While
    in that list, every directly following bullet line has its surrounding
    whitespace stripped. The first line that is neither kind ends the list
    and is passed through unchanged, as are all lines outside such a list.

    Args:
        content: markdown text to clean.

    Returns:
        The markdown with the affected lists normalised.
    """
    double_bullet = re.compile(r'^-\s-\s(.*)', re.MULTILINE)
    any_bullet = re.compile(r'^\s*-\s')

    output = []
    flattening = False  #True while inside a list opened by a double bullet
    for raw in content.split('\n'):
        hit = double_bullet.match(raw)
        if hit is not None:
            #keep only the text after the second hyphen
            output.append('- ' + hit.group(1).strip())
            flattening = True
            continue
        if flattening and any_bullet.match(raw):
            #later bullets of the same list lose their indentation
            output.append(raw.strip())
            continue
        flattening = False
        output.append(raw)

    return '\n'.join(output)
#links on text that is already a link
def removeRedundantLinks(content):
    """Collapse "(url)[url]" pairs into the bare url.

    Only pairs where both halves are the same http:// or https:// URL are
    touched; anything else is returned as it was.

    Args:
        content: markdown text to clean.

    Returns:
        The markdown with each redundant pair replaced by a single URL.
    """
    #(link)[link] where \1 forces the bracketed part to repeat the first URL
    same_url_twice = re.compile(r'\((https?:\/\/[^\s\)]+)\)\[\1\]')
    return same_url_twice.sub(lambda found: found.group(1), content)
#process links, google redirects, normal redirects, and relative links (takes in a url)
def convertLink(url):
    """Resolve one markdown link target to the URL it should point at.

    * A site-relative path ("/page") gets RELATIVE_LINK_HEAD prepended.
    * A Google redirect ("www.google.com/url?q=...") is replaced by the
      decoded value of its "q" parameter, or returned unchanged when there
      is no such parameter.
    * Any other http(s) URL is requested and the final URL after redirects
      is returned; on a request error the error is printed and the input is
      returned.
    * Everything else (anchors, mailto:, protocol-relative or scheme-less
      targets) is returned unchanged without a network call.

    Args:
        url: the link target as it appears in the markdown.

    Returns:
        The converted URL string.
    """
    #relative links ("//host/..." is protocol-relative, not a site path)
    if url.startswith("/") and not url.startswith("//"):
        return RELATIVE_LINK_HEAD + url

    #google redirect links
    if "www.google.com/url" in url:
        query_params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        targets = query_params.get('q')
        return targets[0] if targets else url

    # targets without an http(s) scheme cannot be fetched; hand them back
    # as-is instead of letting the request fail and log an error for each
    if not url.lower().startswith(("http://", "https://")):
        return url

    #redirects
    # NOTE(review): no timeout is passed, so an unresponsive host blocks here
    try:
        response = requests.get(url)
    except requests.RequestException as e:
        print(f"Error following redirects for {url}: {e}")
        return url
    return response.url
#finds all links and runs them through convertLink
def process_links(content):
    """Rewrite the target of every markdown "[text](url)" link.

    Each target is passed through convertLink and the link is rebuilt with
    the same text and the converted target.

    Args:
        content: markdown text.

    Returns:
        The markdown with all link targets converted.
    """
    md_link = re.compile(r'\[(.*?)\]\((.*?)\)')

    def rebuild(found):
        label, target = found.groups()
        return '[{}]({})'.format(label, convertLink(target))

    return md_link.sub(rebuild, content)
#given a file path to an md file, run it through every cleanup function and write into sample.md
def fullCleanup(file_path, output_path="sample.md"):
    """Clean up a markdown file and write the result to another file.

    The file is read in full, run through fullCleanupString (the same
    pipeline used for in-memory strings) and written out.

    Args:
        file_path: path of the markdown file to read.
        output_path: path of the file to write; defaults to "sample.md".

    Raises:
        OSError: if either file cannot be opened.
    """
    # utf-8 is pinned for both files so the result does not depend on the
    # platform's default encoding
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()  # Read entire file as a string
    content = fullCleanupString(content)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(content)
#given a md string, run through every cleanup function and return result
def fullCleanupString(str):
    """Run a markdown string through every cleanup step and return it.

    Steps run in this fixed order: addHeaderAndFooter, fixTitles,
    fixBullets, removeRedundantLinks, fixIndentedBullets, process_links.

    Args:
        str: markdown text to clean.

    Returns:
        The fully cleaned markdown.
    """
    pipeline = (
        addHeaderAndFooter,
        fixTitles,
        fixBullets,
        removeRedundantLinks,
        fixIndentedBullets,
        process_links,
    )
    cleaned = str
    for step in pipeline:
        #each step takes the previous step's output
        cleaned = step(cleaned)
    return cleaned
#TESTS
class TestMarkdownLinkProcessing(unittest.TestCase):
    """Unit tests for removeRedundantLinks and process_links."""

    def test_remove_redundant_links(self):
        """(url)[url] pairs collapse; everything else is left alone."""
        #standard use cases
        with_duplicate = (
            '\n'
            'redundant link (https://mail.google.com/mail/u/1/#inbox)[https://mail.google.com/mail/u/1/#inbox].\n'
            'not redundant link [example](https://www.example.com).\n'
        )
        without_duplicate = (
            '\n'
            'redundant link https://mail.google.com/mail/u/1/#inbox.\n'
            'not redundant link [example](https://www.example.com).\n'
        )
        self.assertEqual(removeRedundantLinks(with_duplicate), without_duplicate)

        #edge cases:
        #a link that does not start with http:// or https:// is not picked up
        #two different links in the (..)[..] form are not corrected
        untouched_input = (
            '\n'
            'not link [www.example.com](www.example.com).\n'
            'Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].\n'
        )
        untouched_expected = (
            '\n'
            'not link [www.example.com](www.example.com).\n'
            'Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].\n'
        )
        self.assertEqual(removeRedundantLinks(untouched_input), untouched_expected)

    @patch('requests.get')
    def test_replace_links(self, mock_get):
        """Relative, Google-redirect and plain links are each converted."""

        class MockResponse:
            """Stand-in for a response object; only .url is provided."""

            def __init__(self, url):
                self.url = url

        #final URL reported by the fake requests.get for each known request
        final_urls = {
            'http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw':
                'http://www.typolexikon.de/',
            'http://www.example.com/': 'http://www.example.com/',
        }

        def mock_get_response(url):
            #unknown URLs "resolve" to themselves
            return MockResponse(final_urls.get(url, url))

        mock_get.side_effect = mock_get_response

        #standard use cases
        source_markdown = (
            '\n'
            'relative link [page](/relative-page).\n'
            'Google redirect link [typolexikon.de](http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw).\n'
            'normal link [example.com](http://www.example.com/).\n'
        )
        converted_markdown = (
            '\n'
            'relative link [page](https://cldr.unicode.org/relative-page).\n'
            'Google redirect link [typolexikon.de](http://www.typolexikon.de/).\n'
            'normal link [example.com](http://www.example.com/).\n'
        )
        deduplicated = removeRedundantLinks(source_markdown)
        self.assertEqual(process_links(deduplicated), converted_markdown)
if __name__ == '__main__':
    # script entry point: clean up "testing.md" via fullCleanup first, then
    # run the unit tests defined in this file
    fullCleanup("testing.md")
    unittest.main()