small-hacks/verify-markdown-links.py

#!/usr/bin/python3
# Copyright © 2021, Bradley M. Kuhn
#  Also copyrighted by others as well (no notices shared), and it is 
# Licensed CC-BY-SA-4.0 because I borrowed an example from: https://www.py4u.net/discuss/208440
import sys
import re
from pathlib import Path
    
# Inline links: [text](target)
INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
# Reference-style link uses with a numeric label: [text][1]
FOOTNOTE_LINK_TEXT_RE = re.compile(r'\[([^\]]+)\]\[(\d+)\]')
# Reference definitions with a numeric label: [1]: target
FOOTNOTE_LINK_URL_RE = re.compile(r'\[(\d+)\]:\s+(\S+)')


def find_md_links(md):
    """Return a list of ``(text, target)`` tuples for the links found in *md*.

    Inline links (``[text](target)``) come first, in document order,
    followed by numeric reference-style links (``[text][1]`` paired with a
    ``[1]: target`` definition), also in document order.

    A reference-style link whose number has no matching definition is
    skipped, because there is no target to report for it.

    Args:
        md: The Markdown document as a single string.

    Returns:
        A list of ``(link_text, link_target)`` string tuples.
    """
    links = list(INLINE_LINK_RE.findall(md))
    footnote_urls = dict(FOOTNOTE_LINK_URL_RE.findall(md))

    # Walk the (text, number) pairs directly rather than through a dict keyed
    # by text, so the same link text used with different footnote numbers is
    # not collapsed into a single entry.
    for text, number in FOOTNOTE_LINK_TEXT_RE.findall(md):
        url = footnote_urls.get(number)
        if url is not None:
            links.append((text, url))

    return links

# Read the whole Markdown document from standard input.
body_markdown = sys.stdin.read()

for (text, link) in find_md_links(body_markdown):
    errMsg = None
    # Targets starting with mailto, http, ftp or '#' are not checked; every
    # other target is treated as a path to a local file (a relative path is
    # resolved against the current working directory).
    if not re.match(r'^(mailto|http|ftp|#)', link, re.IGNORECASE):
        # A local link may carry a "#fragment" suffix (e.g. "doc.md#usage").
        # Accept the link if either the literal target or the target with the
        # fragment removed names an existing file.
        file_part = link.partition('#')[0]
        if not (Path(link).is_file() or Path(file_part).is_file()):
            errMsg = "local file by that name does not exist for relative file link"
    # FIXME: test other types of links
    if errMsg:
        print("Bad link of \"%s\" (labeled as \"%s\") has error: %s" % (link, text, errMsg))

# Use sys.exit() rather than the exit() helper, which is injected by the
# site module for interactive use and is not guaranteed to exist.
sys.exit(0)
# Alternative approach (experimental): render the Markdown to HTML and list
# every <a> element with its href.
#
# This code is unreachable while the exit call above remains.  If it is ever
# enabled, note that standard input has already been read to EOF above, so the
# read() below would return an empty string unless the input is reused.
#
# The earlier version failed because it parsed an undefined name (`body`) and
# because it fed the rendered HTML fragment to the strict XML parser.


import markdown
from lxml import etree
import sys

body_markdown = sys.stdin.read()
body = markdown.markdown(body_markdown)
print(body)
# The rendered output is an HTML fragment, not a single-rooted XML document,
# so wrap it in one element and parse it with lxml's lenient HTML parser.
doc = etree.HTML("<div>%s</div>" % body)
for link in doc.xpath('//a'):
    print(link.text, link.get('href'))
First draft of script to verify that links in a Markdown file work. The goal of this script is to go through a Markdown file and verify that all of its links work. Currently, only relative path links to files in the same directory are tested. 2021-10-16 17:18:13 -07:00			`#!/usr/bin/python3`
			`# Copyright © 2021, Bradley M. Kuhn`
			`# Also copyrighted by others as well (no notices shared), and it is`
			`# Licensed CC-BY-SA-4.0 because I borrowed an example from: https://www.py4u.net/discuss/208440`
			`import sys`
			`import re`
			`from pathlib import Path`

			`INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')`
			`FOOTNOTE_LINK_TEXT_RE = re.compile(r'\[([^\]]+)\]\[(\d+)\]')`
			`FOOTNOTE_LINK_URL_RE = re.compile(r'\[(\d+)\]:\s+(\S+)')`


			`def find_md_links(md):`
			`""" Return dict of links in markdown """`

			`links = list(INLINE_LINK_RE.findall(md))`
			`footnote_links = dict(FOOTNOTE_LINK_TEXT_RE.findall(md))`
			`footnote_urls = dict(FOOTNOTE_LINK_URL_RE.findall(md))`

			`for key in footnote_links.keys():`
			`links.append((footnote_links[key], footnote_urls[footnote_links[key]]))`

			`return links`

			`body_markdown = sys.stdin.read()`

			`for (text, link) in find_md_links(body_markdown):`
			`errMsg = None`
			`if not re.match(r'^(mailto\|http\|ftp\|#)', link, re.IGNORECASE):`
			`path = Path(link)`
			`if not path.is_file():`
			`errMsg = "local file by that name does not exist for relative file link"`
			`# FIXME: test other types of links`
			`if errMsg:`
			`print("Bad link of \"%s\" (labeled as \"%s\") has error: %s" % (link, text, errMsg))`

			`exit(0)`
			`# Below doesn't work, didn't have time to find out why`


			`import markdown`
			`from lxml import etree`
			`import sys`

			`body_markdown = sys.stdin.read()`
			`print(markdown.markdown(body_markdown))`
			`doc = etree.fromstring(body)`
			`for link in doc.xpath('//a'):`
			`print(link.text, link.get('href'))`