| import re |
| import requests |
| import urllib.parse |
| import unittest |
| from unittest.mock import patch |
| |
# Base URL that convertLink() prepends to every site-relative link
# (a link target starting with "/").
RELATIVE_LINK_HEAD = "https://cldr.unicode.org"
| |
| #sometimes the html --> md conversion puts extra spaces between bullets |
def fixBullets(content):
    """Tidy bullet spacing left behind by the html --> md conversion.

    Three things happen, in order:

    1. A dash followed by three spaces/tabs becomes ``"- "``.
    2. A numbered marker (``"1."``) followed by two spaces/tabs gets a
       single space instead.
    3. Blank lines that sit *between* two list lines are removed.

    Only horizontal whitespace is collapsed in steps 1 and 2, so a line that
    merely ends in ``"-"`` or ``"2."`` is never merged with the paragraph
    after it. Blank lines that follow the last list line are kept, because
    they are what separates the list from the text after it.

    Args:
        content: Markdown text.

    Returns:
        The cleaned text, with lines joined by ``"\\n"`` (a trailing newline
        on the input is not preserved, since the text is rebuilt from
        ``splitlines()``).
    """
    # remove extra spaces after dash in bullet points (never across a newline)
    content = re.sub(r'-[ \t]{3}', '- ', content)
    # remove extra space after numbered bullet points (never across a newline)
    content = re.sub(r'(\d+\.)[ \t]{2}', r'\1 ', content)

    # a "list line" is any line whose first non-blank character is a dash or digit
    list_line = re.compile(r'^\s*[-\d]')

    processed_lines = []
    pending_blanks = []  # blank lines seen since the last list line
    in_list = False
    for line in content.splitlines():
        if list_line.match(line):
            # blanks held back so far were between two list lines: drop them
            pending_blanks.clear()
            in_list = True
            processed_lines.append(line)
        elif in_list and not line.strip():
            # decide later, once we know whether the list continues
            pending_blanks.append(line)
        else:
            # the list (if any) has ended: restore the blanks that closed it
            processed_lines.extend(pending_blanks)
            pending_blanks.clear()
            in_list = False
            processed_lines.append(line)

    # blanks after a list at the very end of the text are kept as well
    processed_lines.extend(pending_blanks)

    return '\n'.join(processed_lines)
| |
| #html-->md conversion puts link headings into md and messes up titles |
def fixTitles(content):
    """Rebuild headings that the html --> md conversion split around an empty anchor link.

    The conversion can emit a heading as::

        ##
        [
        ](#some-anchor)
        Title text

    This collapses each such group into ``"## Title text"``. The newlines
    that followed the title line are written back unchanged, so the heading
    stays separated from the text after it.

    Args:
        content: Markdown text.

    Returns:
        The text with every matched heading group rewritten onto one line.
    """
    # group 1: heading marks, group 2: title line, group 3: newlines after the title
    pattern = re.compile(r'(#+)\s*\n*\[\n*\]\(#.*\)\n(.*)(\n*)')

    def replaceUnwanted(match):
        heading_level = match.group(1)       # heading level (ex. ##)
        title_text = match.group(2).strip()  # title without surrounding whitespace
        trailing_newlines = match.group(3)   # re-emitted so the next line is not glued on
        return f"{heading_level} {title_text}{trailing_newlines}"

    return pattern.sub(replaceUnwanted, content)
| |
| # add title at top and unicode copyright at bottom |
def addHeaderAndFooter(content):
    """Prepend a YAML front-matter title block and append the footer when missing.

    The title is whatever follows the first ``"#"``-plus-whitespace in the
    text; ``"Default Title"`` is used when there is none. The header is only
    added if the text does not already contain a ``---`` / ``title:`` /
    ``---`` block.

    Args:
        content: Markdown text.

    Returns:
        The text with header and footer applied as needed.
    """
    # title = rest of the line after the first "# " found in the text
    heading = re.search(r'(?<=#\s).*', content)
    title = heading.group(0).strip() if heading else "Default Title"

    header = f"---\ntitle: {title}\n---\n"
    footer = "\n\n"

    # both checks look at the text as it was passed in
    needs_header = re.search(r'^---\n.*title:.*\n---', content, re.MULTILINE) is None
    # NOTE(review): the footer literal is whitespace only, so footer.strip()
    # is "" and this membership test always succeeds -- the footer is never
    # appended. Confirm what the footer text is meant to be.
    needs_footer = footer.strip() not in content

    if needs_header:
        content = header + content
    if needs_footer:
        content = content + footer

    return content
| |
| #html-->md sometimes produces double bullets on indented lists |
def fixIndentedBullets(content):
    """Collapse ``"- - item"`` double bullets into single ``"- item"`` bullets.

    Once a double bullet has been seen, the bullet lines that follow it are
    stripped of surrounding whitespace too, until the first line that is not
    a bullet. All other lines pass through untouched.

    Args:
        content: Markdown text.

    Returns:
        The text with the affected bullet lines normalized.
    """
    double_bullet = re.compile(r'^-\s-\s(.*)', re.MULTILINE)

    result = []
    flattening = False  # True while inside a list that started with a double bullet
    for raw in content.split('\n'):
        doubled = double_bullet.match(raw)
        if doubled:
            # keep one dash and the trimmed item text
            result.append('- ' + doubled.group(1).strip())
            flattening = True
            continue
        if flattening and re.match(r'^\s*-\s', raw):
            # later bullet of the same list: drop its indentation
            result.append(raw.strip())
            continue
        # anything else ends the list and is copied as-is
        result.append(raw)
        flattening = False

    return '\n'.join(result)
| |
| #links on text that is already a link |
def removeRedundantLinks(content):
    """Replace ``(url)[url]`` with the bare ``url`` when both parts are identical.

    Only URLs starting with ``http://`` or ``https://`` are considered, and
    the text in the brackets must repeat the URL in the parentheses exactly.

    Args:
        content: Markdown text.

    Returns:
        The text with each redundant pair reduced to a single URL.
    """
    # \1 forces the bracketed part to equal the URL captured in the parentheses
    duplicated_link = re.compile(r'\((https?:\/\/[^\s\)]+)\)\[\1\]')
    return duplicated_link.sub(lambda found: found.group(1), content)
| |
| #process links, google redirects, normal redirects, and relative links (takes in a url) |
def convertLink(url):
    """Resolve one link target to the URL it should point at.

    * A target starting with ``"/"`` is site-relative and gets
      ``RELATIVE_LINK_HEAD`` prepended.
    * A Google redirect (``www.google.com/url?q=...``) is replaced by its
      decoded ``q`` parameter; without a ``q`` parameter it is returned
      unchanged.
    * Any other ``http://`` / ``https://`` URL is fetched with
      ``requests.get`` and the final URL after redirects is returned. If
      the request fails, the error is printed and the original URL is
      returned.
    * Everything else (``#anchor``, ``mailto:``, scheme-less text, ...) is
      returned unchanged without a network call; such a request could only
      fail and print an error.

    Args:
        url: The link target as written in the markdown.

    Returns:
        The converted URL string.
    """
    # relative links
    if url.startswith("/"):
        return RELATIVE_LINK_HEAD + url

    # google redirect links: the real destination is the "q" query parameter
    if "www.google.com/url" in url:
        query = urllib.parse.urlparse(url).query
        destinations = urllib.parse.parse_qs(query).get('q')
        return destinations[0] if destinations else url

    # only http(s) targets can be followed
    if not url.lower().startswith(("http://", "https://")):
        return url

    # redirects
    # NOTE(review): the call has no timeout, so a stalled server blocks the
    # run. Adding one changes the call signature that the unit-test mock in
    # this file expects, so it is left for a coordinated change.
    try:
        response = requests.get(url)
    except requests.RequestException as e:
        print(f"Error following redirects for {url}: {e}")
        return url
    return response.url
| |
#finds all markdown links and runs each URL through convertLink
def process_links(content):
    """Rewrite the URL of every ``[text](url)`` link using convertLink().

    The link text is kept as-is; only the part inside the parentheses is
    replaced by what convertLink() returns for it.

    Args:
        content: Markdown text.

    Returns:
        The text with every link target converted.
    """
    # non-greedy groups: group 1 is the link text, group 2 the target
    md_link = re.compile(r'\[(.*?)\]\((.*?)\)')

    def _rewrite(found):
        label, target = found.groups()
        return '[{}]({})'.format(label, convertLink(target))

    return md_link.sub(_rewrite, content)
| |
#given a file path to an md file, run it through every cleanup function and write the result into sample.md
def fullCleanup(file_path, output_path="sample.md"):
    """Clean up a markdown file and write the result to another file.

    The file is read and written as UTF-8 rather than the platform default
    encoding, so non-ASCII text is handled the same way on every platform.
    The cleanup itself is delegated to fullCleanupString() so the file and
    string entry points always run the same steps.

    Args:
        file_path: Path of the markdown file to read.
        output_path: Path of the file to write; defaults to ``"sample.md"``.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        content = source.read()  # read entire file as a string

    cleaned = fullCleanupString(content)

    with open(output_path, 'w', encoding='utf-8') as target:
        target.write(cleaned)
| |
| #given a md string, run through every cleanup function and return result |
def fullCleanupString(str):
    """Run a markdown string through every cleanup step and return the result.

    The steps are applied in a fixed order, each one receiving the output
    of the previous one.

    Args:
        str: Markdown text to clean.

    Returns:
        The cleaned markdown text.
    """
    pipeline = (
        addHeaderAndFooter,
        fixTitles,
        fixBullets,
        removeRedundantLinks,
        fixIndentedBullets,
        process_links,
    )
    content = str
    for step in pipeline:
        content = step(content)
    return content
| |
| |
| #TESTS |
class TestMarkdownLinkProcessing(unittest.TestCase):
    """Unit tests for removeRedundantLinks() and process_links()."""

    # removeRedundantLinks() must collapse "(url)[url]" pairs and leave
    # everything else untouched.
    def test_remove_redundant_links(self):
        #standard use cases
        markdown_content1 = '''
        redundant link (https://mail.google.com/mail/u/1/#inbox)[https://mail.google.com/mail/u/1/#inbox].
        not redundant link [example](https://www.example.com).
        '''
        expected_output1 = '''
        redundant link https://mail.google.com/mail/u/1/#inbox.
        not redundant link [example](https://www.example.com).
        '''
        self.assertEqual(removeRedundantLinks(markdown_content1), expected_output1)

        #edge cases:
        #If the link does not start with http:// or https:// it will not be picked up as a link
        #if the two links are different, it does not get corrected
        markdown_content2 = '''
        not link [www.example.com](www.example.com).
        Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
        '''
        expected_output2 = '''
        not link [www.example.com](www.example.com).
        Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
        '''
        self.assertEqual(removeRedundantLinks(markdown_content2), expected_output2)

    # process_links() must expand relative links, unwrap Google redirects and
    # pass normal links through requests.get (patched here, so no network).
    @patch('requests.get')
    def test_replace_links(self, mock_get):
        #mock responses for the requests.get call made by convertLink
        def mock_get_response(url):
            # minimal stand-in for a response object: only .url is read
            class MockResponse:
                def __init__(self, url):
                    self.url = url
            # convertLink decodes Google redirects from the query string and
            # does not call requests.get for them, so this branch is only a
            # safety net
            if url == 'http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw':
                return MockResponse('http://www.typolexikon.de/')
            elif url == 'http://www.example.com/':
                return MockResponse('http://www.example.com/')
            # any other URL "resolves" to itself
            return MockResponse(url)

        mock_get.side_effect = mock_get_response

        #standard use cases
        markdown_content1 = '''
        relative link [page](/relative-page).
        Google redirect link [typolexikon.de](http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw).
        normal link [example.com](http://www.example.com/).
        '''
        expected_output1 = '''
        relative link [page](https://cldr.unicode.org/relative-page).
        Google redirect link [typolexikon.de](http://www.typolexikon.de/).
        normal link [example.com](http://www.example.com/).
        '''
        # same order as the full pipeline: redundant links first, then link conversion
        cleaned_content = removeRedundantLinks(markdown_content1)
        self.assertEqual(process_links(cleaned_content), expected_output1)
| |
if __name__ == '__main__':
    # When run as a script: first clean the hard-coded "testing.md"
    # (fullCleanup writes the result to "sample.md"), then run the unit
    # tests defined above.
    fullCleanup("testing.md")
    unittest.main()