# pastebin.py
# -*- coding: utf-8 -*-
from mrjob.job import MRJob
from mrjob.step import MRStep
  3. class HitCounts(MRJob):
  4. def extract_url(self, line):
  5. """Extract the generated url from the log line."""
  6. pass
  7. def extract_year_month(self, line):
  8. """Return the year and month portions of the timestamp."""
  9. pass
  10. def mapper(self, _, line):
  11. """Parse each log line, extract and transform relevant lines.
  12. Emit key value pairs of the form:
  13. (2016-01, url0), 1
  14. (2016-01, url0), 1
  15. (2016-01, url1), 1
  16. """
  17. url = self.extract_url(line)
  18. period = self.extract_year_month(line)
  19. yield (period, url), 1
  20. def reducer(self, key, values):
  21. """Sum values for each key.
  22. (2016-01, url0), 2
  23. (2016-01, url1), 1
  24. """
  25. yield key, sum(values)
  26. def steps(self):
  27. """Run the map and reduce steps."""
  28. return [
  29. self.mr(mapper=self.mapper,
  30. reducer=self.reducer)
  31. ]
  32. if __name__ == '__main__':
  33. HitCounts.run()