# web_crawler_snippets.py
  1. # -*- coding: utf-8 -*-
  2. class PagesDataStore(object):
  3. def __init__(self, db):
  4. self.db = db
  5. pass
  6. def add_link_to_crawl(self, url):
  7. """Add the given link to `links_to_crawl`."""
  8. pass
  9. def remove_link_to_crawl(self, url):
  10. """Remove the given link from `links_to_crawl`."""
  11. pass
  12. def reduce_priority_link_to_crawl(self, url):
  13. """Reduce the priority of a link in `links_to_crawl` to avoid cycles."""
  14. pass
  15. def extract_max_priority_page(self):
  16. """Return the highest priority link in `links_to_crawl`."""
  17. pass
  18. def insert_crawled_link(self, url, signature):
  19. """Add the given link to `crawled_links`."""
  20. pass
  21. def crawled_similar(self, signature):
  22. """Determine if we've already crawled a page matching the given signature"""
  23. pass
  24. class Page(object):
  25. def __init__(self, url, contents, child_urls):
  26. self.url = url
  27. self.contents = contents
  28. self.child_urls = child_urls
  29. self.signature = self.create_signature()
  30. def create_signature(self):
  31. # Create signature based on url and contents
  32. pass
  33. class Crawler(object):
  34. def __init__(self, pages, data_store, reverse_index_queue, doc_index_queue):
  35. self.pages = pages
  36. self.data_store = data_store
  37. self.reverse_index_queue = reverse_index_queue
  38. self.doc_index_queue = doc_index_queue
  39. def crawl_page(self, page):
  40. for url in page.child_urls:
  41. self.data_store.add_link_to_crawl(url)
  42. self.reverse_index_queue.generate(page)
  43. self.doc_index_queue.generate(page)
  44. self.data_store.remove_link_to_crawl(page.url)
  45. self.data_store.insert_crawled_link(page.url, page.signature)
  46. def crawl(self):
  47. while True:
  48. page = self.data_store.extract_max_priority_page()
  49. if page is None:
  50. break
  51. if self.data_store.crawled_similar(page.signature):
  52. self.data_store.reduce_priority_link_to_crawl(page.url)
  53. else:
  54. self.crawl_page(page)
  55. page = self.data_store.extract_max_priority_page()