# web_crawler_mapreduce.py
# -*- coding: utf-8 -*-
from mrjob.job import MRJob
from mrjob.step import MRStep
  3. class RemoveDuplicateUrls(MRJob):
  4. def mapper(self, _, line):
  5. yield line, 1
  6. def reducer(self, key, values):
  7. total = sum(values)
  8. if total == 1:
  9. yield key, total
  10. def steps(self):
  11. """Run the map and reduce steps."""
  12. return [
  13. self.mr(mapper=self.mapper,
  14. reducer=self.reducer)
  15. ]
  16. if __name__ == '__main__':
  17. RemoveDuplicateUrls.run()