tools/scripts/web/conversion_scripts/cleanup.py - platform/external/cldr - Git at Google

 import re
 import requests
 import urllib.parse
 import unittest
 from unittest.mock import patch

 #head to place at start of all relative links
 RELATIVE_LINK_HEAD = "https://cldr.unicode.org"

 #sometimes the html --> md conversion puts extra spaces between bullets
 def fixBullets(content):
     #remove extra spaces after dash in bullet points
     content = re.sub(r'-\s{3}', '- ', content)
     #remove extra space after numbered bullet points
     content = re.sub(r'(\d+\.)\s{2}', r'\1 ', content)
     #process lines for list handling
     processed_lines = []
     in_list = False
     for line in content.splitlines():
         if re.match(r'^\s*[-\d]', line):
             #check if the current line is part of a list
             in_list = True
         elif in_list and not line.strip():
             #skip empty lines within lists
             continue
         else:
             in_list = False
         processed_lines.append(line)
     processed_content = '\n'.join(processed_lines)

     return processed_content

 #html-->md conversion puts link headings into md and messes up titles
 def fixTitles(content):
     #link headings regex
     pattern = re.compile(r'(#+)\s*\n*\[\n*\]\(#.*\)\n(.*)\n*')

     #replace matched groups
     def replaceUnwanted(match):
         heading_level = match.group(1)  #heading level (ex. ##)
         title_text = match.group(2).strip()  #capture and strip the title text
         return f"{heading_level} {title_text}"  #return the formatted heading and title on the same line

     # Replace the unwanted text using the defined pattern and function
     processed_content = re.sub(pattern, replaceUnwanted, content)
     return processed_content

 # add title at top and unicode copyright at bottom
 def addHeaderAndFooter(content):
     #get title from top of md file
     title_match = re.search(r'(?<=#\s).*', content)
     if title_match:
         title = title_match.group(0).strip()
     else:
         title = "Default Title"  #default if couldnt find

     #header
     header = f"---\ntitle: {title}\n---\n"
     #footer
     footer = "\n![Unicode copyright](https://www.unicode.org/img/hb_notice.gif)\n"

     #look for existing title and copywrite in the YAML front matter
     title_exists = re.search(r'^---\n.*title:.*\n---', content, re.MULTILINE)
     footer_exists = footer.strip() in content

     #add header
     if not title_exists:
         content = header + content

     #add footer
     if not footer_exists:
         content = content + footer

     return content

 #html-->md sometimes produces double bullets on indented lists
 def fixIndentedBullets(content):
     #regex pattern to match the double hyphen bullets
     pattern = re.compile(r'^-\s-\s(.*)', re.MULTILINE)

     #split into lines
     lines = content.split('\n')

     #normalize bullets
     normalized_lines = []
     in_list = False

     for line in lines:
         #lines with double hyphens
         match = pattern.match(line)
         if match:
             #normalize the double hyphen bullet
             bullet_point = match.group(1)
             normalized_lines.append(f'- {bullet_point.strip()}')
             in_list = True
         elif in_list and re.match(r'^\s*-\s', line):
             #remove indentation from following bullets in the same list
             normalized_lines.append(line.strip())
         else:
             normalized_lines.append(line)
             in_list = False

     #join back into a single string
     processed_content = '\n'.join(normalized_lines)
     return processed_content

 #links on text that is already a link
 def removeRedundantLinks(content):
     #(link)[link] regex pattern
     link_pattern = re.compile(r'\((https?:\/\/[^\s\)]+)\)\[\1\]')

     #function to process unwanted links
     def replace_link(match):
         return match.group(1)  #return only the first URL

     #replace the links
     processed_content = re.sub(link_pattern, replace_link, content)
     return processed_content

 #process links, google redirects, normal redirects, and relative links (takes in a url)
 def convertLink(url):
     #relative links
     if url.startswith("/"):
         return RELATIVE_LINK_HEAD + url
     #google redirect links
     elif "www.google.com/url" in url:
         parsed_url = urllib.parse.urlparse(url)
         query_params = urllib.parse.parse_qs(parsed_url.query)
         if 'q' in query_params:
             return query_params['q'][0]
         return url
     #redirects
     else:
         try:
             response = requests.get(url)
             return response.url
         except requests.RequestException as e:
             print(f"Error following redirects for {url}: {e}")
             return url

 #finds all links and runs them through converLink
 def process_links(content):
     #regex pattern for md links
     pattern = re.compile(r'\[(.*?)\]\((.*?)\)')

     #replace each link
     def replace_link(match):
         text = match.group(1)
         url = match.group(2)
         new_url = convertLink(url)
         return f'[{text}]({new_url})'

     return pattern.sub(replace_link, content)

 #given a file path to an md file, run it through every cleanup function and write inot samle.md
 def fullCleanup(file_path):
     with open(file_path, 'r') as file:
         content = file.read()  # Read entire file as a string
     content = addHeaderAndFooter(content)
     content = fixTitles(content)
     content = fixBullets(content)
     content = removeRedundantLinks(content)
     content = fixIndentedBullets(content)
     content = process_links(content)
     with open("sample.md", 'w') as file:
         file.write(content)

 #given a md string, run through every cleanup function and return result
 def fullCleanupString(str):
     content = addHeaderAndFooter(str)
     content = fixTitles(content)
     content = fixBullets(content)
     content = removeRedundantLinks(content)
     content = fixIndentedBullets(content)
     content = process_links(content)
     return content


 #TESTS
 class TestMarkdownLinkProcessing(unittest.TestCase):
     def test_remove_redundant_links(self):
         #standard use cases
         markdown_content1 = '''
         redundant link (https://mail.google.com/mail/u/1/#inbox)[https://mail.google.com/mail/u/1/#inbox].
         not redundant link [example](https://www.example.com).
         '''
         expected_output1 = '''
         redundant link https://mail.google.com/mail/u/1/#inbox.
         not redundant link [example](https://www.example.com).
         '''
         self.assertEqual(removeRedundantLinks(markdown_content1), expected_output1)

         #edge cases:
         #If the link does not start with http:// or https:// it will not be picked up as a link
         #if the two links are different, it does not get corrected
         markdown_content2 = '''
         not link [www.example.com](www.example.com).
         Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
         '''
         expected_output2 = '''
         not link [www.example.com](www.example.com).
         Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
         '''
         self.assertEqual(removeRedundantLinks(markdown_content2), expected_output2)

     @patch('requests.get')
     def test_replace_links(self, mock_get):
         #mock responses for follow_redirects function
         def mock_get_response(url):
             class MockResponse:
                 def __init__(self, url):
                     self.url = url
             if url == 'http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw':
                 return MockResponse('http://www.typolexikon.de/')
             elif url == 'http://www.example.com/':
                 return MockResponse('http://www.example.com/')
             return MockResponse(url)

         mock_get.side_effect = mock_get_response

         #standard use cases
         markdown_content1 = '''
         relative link [page](/relative-page).
         Google redirect link [typolexikon.de](http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw).
         normal link [example.com](http://www.example.com/).
         '''
         expected_output1 = '''
         relative link [page](https://cldr.unicode.org/relative-page).
         Google redirect link [typolexikon.de](http://www.typolexikon.de/).
         normal link [example.com](http://www.example.com/).
         '''
         cleaned_content = removeRedundantLinks(markdown_content1)
         self.assertEqual(process_links(cleaned_content), expected_output1)

 if __name__ == '__main__':
     fullCleanup("testing.md")
     unittest.main()
	import re
	import requests
	import urllib.parse
	import unittest
	from unittest.mock import patch

	#head to place at start of all relative links
	RELATIVE_LINK_HEAD = "https://cldr.unicode.org"

	#sometimes the html --> md conversion puts extra spaces between bullets
	def fixBullets(content):
	#remove extra spaces after dash in bullet points
	content = re.sub(r'-\s{3}', '- ', content)
	#remove extra space after numbered bullet points
	content = re.sub(r'(\d+\.)\s{2}', r'\1 ', content)
	#process lines for list handling
	processed_lines = []
	in_list = False
	for line in content.splitlines():
	if re.match(r'^\s*[-\d]', line):
	#check if the current line is part of a list
	in_list = True
	elif in_list and not line.strip():
	#skip empty lines within lists
	continue
	else:
	in_list = False
	processed_lines.append(line)
	processed_content = '\n'.join(processed_lines)

	return processed_content

	#html-->md conversion puts link headings into md and messes up titles
	def fixTitles(content):
	#link headings regex
	pattern = re.compile(r'(#+)\s\n\[\n\]\(#.\)\n(.)\n')

	#replace matched groups
	def replaceUnwanted(match):
	heading_level = match.group(1) #heading level (ex. ##)
	title_text = match.group(2).strip() #capture and strip the title text
	return f"{heading_level} {title_text}" #return the formatted heading and title on the same line

	# Replace the unwanted text using the defined pattern and function
	processed_content = re.sub(pattern, replaceUnwanted, content)
	return processed_content

	# add title at top and unicode copyright at bottom
	def addHeaderAndFooter(content):
	#get title from top of md file
	title_match = re.search(r'(?<=#\s).*', content)
	if title_match:
	title = title_match.group(0).strip()
	else:
	title = "Default Title" #default if couldnt find

	#header
	header = f"---\ntitle: {title}\n---\n"
	#footer
	footer = "\n![Unicode copyright](https://www.unicode.org/img/hb_notice.gif)\n"

	#look for existing title and copywrite in the YAML front matter
	title_exists = re.search(r'^---\n.title:.\n---', content, re.MULTILINE)
	footer_exists = footer.strip() in content

	#add header
	if not title_exists:
	content = header + content

	#add footer
	if not footer_exists:
	content = content + footer

	return content

	#html-->md sometimes produces double bullets on indented lists
	def fixIndentedBullets(content):
	#regex pattern to match the double hyphen bullets
	pattern = re.compile(r'^-\s-\s(.*)', re.MULTILINE)

	#split into lines
	lines = content.split('\n')

	#normalize bullets
	normalized_lines = []
	in_list = False

	for line in lines:
	#lines with double hyphens
	match = pattern.match(line)
	if match:
	#normalize the double hyphen bullet
	bullet_point = match.group(1)
	normalized_lines.append(f'- {bullet_point.strip()}')
	in_list = True
	elif in_list and re.match(r'^\s*-\s', line):
	#remove indentation from following bullets in the same list
	normalized_lines.append(line.strip())
	else:
	normalized_lines.append(line)
	in_list = False

	#join back into a single string
	processed_content = '\n'.join(normalized_lines)
	return processed_content

	#links on text that is already a link
	def removeRedundantLinks(content):
	#(link)[link] regex pattern
	link_pattern = re.compile(r'\((https?:\/\/[^\s\)]+)\)\[\1\]')

	#function to process unwanted links
	def replace_link(match):
	return match.group(1) #return only the first URL

	#replace the links
	processed_content = re.sub(link_pattern, replace_link, content)
	return processed_content

	#process links, google redirects, normal redirects, and relative links (takes in a url)
	def convertLink(url):
	#relative links
	if url.startswith("/"):
	return RELATIVE_LINK_HEAD + url
	#google redirect links
	elif "www.google.com/url" in url:
	parsed_url = urllib.parse.urlparse(url)
	query_params = urllib.parse.parse_qs(parsed_url.query)
	if 'q' in query_params:
	return query_params['q'][0]
	return url
	#redirects
	else:
	try:
	response = requests.get(url)
	return response.url
	except requests.RequestException as e:
	print(f"Error following redirects for {url}: {e}")
	return url

	#finds all links and runs them through converLink
	def process_links(content):
	#regex pattern for md links
	pattern = re.compile(r'\[(.?)\]\((.?)\)')

	#replace each link
	def replace_link(match):
	text = match.group(1)
	url = match.group(2)
	new_url = convertLink(url)
	return f'[{text}]({new_url})'

	return pattern.sub(replace_link, content)

	#given a file path to an md file, run it through every cleanup function and write inot samle.md
	def fullCleanup(file_path):
	with open(file_path, 'r') as file:
	content = file.read() # Read entire file as a string
	content = addHeaderAndFooter(content)
	content = fixTitles(content)
	content = fixBullets(content)
	content = removeRedundantLinks(content)
	content = fixIndentedBullets(content)
	content = process_links(content)
	with open("sample.md", 'w') as file:
	file.write(content)

	#given a md string, run through every cleanup function and return result
	def fullCleanupString(str):
	content = addHeaderAndFooter(str)
	content = fixTitles(content)
	content = fixBullets(content)
	content = removeRedundantLinks(content)
	content = fixIndentedBullets(content)
	content = process_links(content)
	return content


	#TESTS
	class TestMarkdownLinkProcessing(unittest.TestCase):
	def test_remove_redundant_links(self):
	#standard use cases
	markdown_content1 = '''
	redundant link (https://mail.google.com/mail/u/1/#inbox)[https://mail.google.com/mail/u/1/#inbox].
	not redundant link [example](https://www.example.com).
	'''
	expected_output1 = '''
	redundant link https://mail.google.com/mail/u/1/#inbox.
	not redundant link [example](https://www.example.com).
	'''
	self.assertEqual(removeRedundantLinks(markdown_content1), expected_output1)

	#edge cases:
	#If the link does not start with http:// or https:// it will not be picked up as a link
	#if the two links are different, it does not get corrected
	markdown_content2 = '''
	not link [www.example.com](www.example.com).
	Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
	'''
	expected_output2 = '''
	not link [www.example.com](www.example.com).
	Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
	'''
	self.assertEqual(removeRedundantLinks(markdown_content2), expected_output2)

	@patch('requests.get')
	def test_replace_links(self, mock_get):
	#mock responses for follow_redirects function
	def mock_get_response(url):
	class MockResponse:
	def __init__(self, url):
	self.url = url
	if url == 'http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw':
	return MockResponse('http://www.typolexikon.de/')
	elif url == 'http://www.example.com/':
	return MockResponse('http://www.example.com/')
	return MockResponse(url)

	mock_get.side_effect = mock_get_response

	#standard use cases
	markdown_content1 = '''
	relative link [page](/relative-page).
	Google redirect link [typolexikon.de](http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw).
	normal link [example.com](http://www.example.com/).
	'''
	expected_output1 = '''
	relative link [page](https://cldr.unicode.org/relative-page).
	Google redirect link [typolexikon.de](http://www.typolexikon.de/).
	normal link [example.com](http://www.example.com/).
	'''
	cleaned_content = removeRedundantLinks(markdown_content1)
	self.assertEqual(process_links(cleaned_content), expected_output1)

	if __name__ == '__main__':
	fullCleanup("testing.md")
	unittest.main()