
get_all_links.py 1.7 KB

#!/usr/bin/env python
"""Collect every link from the Markdown files under a directory and report
whether the http(s) URLs among them are still reachable."""
from __future__ import print_function

import codecs
import os
import re
import sys
from socket import timeout

import markdown

try:
    # Python 2
    from urllib2 import urlopen, HTTPError, URLError
except ImportError:
    # Python 3
    from urllib.request import urlopen
    from urllib.error import HTTPError, URLError


def check_live_url(url):
    """Return True if the URL answers with HTTP 200 within two seconds."""
    result = False
    try:
        ret = urlopen(url, timeout=2)
        result = (ret.code == 200)
    except HTTPError as e:
        print(e, file=sys.stderr)
    except URLError as e:
        print(e, file=sys.stderr)
    except timeout as e:
        print(e, file=sys.stderr)
    except Exception as e:
        print(e, file=sys.stderr)
    return result


def main(path):
    # Collect every Markdown file below the given path.
    filenames = []
    for dirpath, _dnames, fnames in os.walk(path):
        for fname in fnames:
            if fname.endswith('.md'):
                filenames.append(os.path.join(dirpath, fname))

    # Render each line to HTML and pull the href out of every <a> tag.
    urls = []
    for filename in filenames:
        fd = codecs.open(filename, mode="r", encoding="utf-8")
        for line in fd.readlines():
            refs = re.findall(r'(?<=<a href=")[^"]*', markdown.markdown(line))
            for ref in refs:
                if ref not in urls:
                    urls.append(ref)
        fd.close()

    # Live URLs go to stdout, dead or unreachable ones to stderr;
    # relative links (usually other Markdown files) are only reported.
    for url in urls:
        if not url.startswith("http"):
            print("markdown file name: " + url)
            continue
        if check_live_url(url):
            print(url)
        else:
            print(url, file=sys.stderr)


if __name__ == '__main__':
    if len(sys.argv) == 2:
        main(sys.argv[1])
    else:
        print("Usage: {} <path>".format(sys.argv[0]), file=sys.stderr)