From bc12af422df102bde8ab0cc50d11984ddd7f2105 Mon Sep 17 00:00:00 2001
From: Sebastian Meragelman
Date: Thu, 22 Sep 2022 07:47:44 -0300
Subject: [PATCH 1/2] Create mapper and reducer example files

---
 bigdata/src/C_H_mapper.py  | 19 ++++++++++++++++++
 bigdata/src/C_H_reducer.py | 40 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 bigdata/src/C_H_mapper.py
 create mode 100644 bigdata/src/C_H_reducer.py

diff --git a/bigdata/src/C_H_mapper.py b/bigdata/src/C_H_mapper.py
new file mode 100644
index 0000000..ff4cb50
--- /dev/null
+++ b/bigdata/src/C_H_mapper.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+"""mapper.py"""
+
+import sys
+
+# input comes from STDIN (standard input)
+for line in sys.stdin:
+    # remove leading and trailing whitespace
+    line = line.strip()
+    # split the line into words
+    words = line.split()
+    # increase counters
+    for word in words:
+        # write the results to STDOUT (standard output);
+        # what we output here will be the input for the
+        # Reduce step, i.e. the input for reducer.py
+        #
+        # tab-delimited; the trivial word count is 1
+        print ('%s\t%s' % (word, 1))
\ No newline at end of file
diff --git a/bigdata/src/C_H_reducer.py b/bigdata/src/C_H_reducer.py
new file mode 100644
index 0000000..d8a05bb
--- /dev/null
+++ b/bigdata/src/C_H_reducer.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+"""reducer.py"""
+
+from operator import itemgetter
+import sys
+
+current_word = None
+current_count = 0
+word = None
+
+# input comes from STDIN
+for line in sys.stdin:
+    # remove leading and trailing whitespace
+    line = line.strip()
+
+    # parse the input we got from mapper.py
+    word, count = line.split('\t', 1)
+
+    # convert count (currently a string) to int
+    try:
+        count = int(count)
+    except ValueError:
+        # count was not a number, so silently
+        # ignore/discard this line
+        continue
+
+    # this IF-switch only works because Hadoop sorts map output
+    # by key (here: word) before it is passed to the reducer
+    if current_word == word:
+        current_count += count
+    else:
+        if current_word:
+            # write result to STDOUT
+            print ('%s\t%s' % (current_word, current_count))
+        current_count = count
+        current_word = word
+
+# do not forget to output the last word if needed!
+if current_word == word:
+    print ('%s\t%s' % (current_word, current_count))
\ No newline at end of file

From 8eac54bfb0ddab10ec8fe2dbe2959437189e9e8a Mon Sep 17 00:00:00 2001
From: Sebastian Meragelman
Date: Thu, 22 Sep 2022 07:55:24 -0300
Subject: [PATCH 2/2] test input output working

---
Review notes (this section is commentary and is not applied):
 * C_H_reducer.py: the line counter is advanced with "i += 1"; the
   previous "i=+1" re-assigned the constant +1 on every iteration, so
   the reported line number never went past 1.
 * C_H_reducer.py: the line-number message is built with str.format;
   the previous call passed str(i) as a second print argument, which
   left the "{}" placeholder unfilled.
 * The blob ids on the "index" lines below still describe the content
   before these two corrections.

 bigdata/src/C_H_mapper.py  |  7 +++++--
 bigdata/src/C_H_reducer.py | 15 ++++++++++++---
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/bigdata/src/C_H_mapper.py b/bigdata/src/C_H_mapper.py
index ff4cb50..8a527d5 100644
--- a/bigdata/src/C_H_mapper.py
+++ b/bigdata/src/C_H_mapper.py
@@ -1,13 +1,15 @@
 #!/usr/bin/env python
 """mapper.py"""
+# TEST FILE TO TEST INPUT OUTPUT
 
 import sys
 
 # input comes from STDIN (standard input)
 for line in sys.stdin:
     # remove leading and trailing whitespace
-    line = line.strip()
+    print("passed by mapper {}".format(line))
     # split the line into words
+'''
     words = line.split()
     # increase counters
     for word in words:
@@ -16,4 +18,5 @@
         # Reduce step, i.e. the input for reducer.py
         #
         # tab-delimited; the trivial word count is 1
-        print ('%s\t%s' % (word, 1))
\ No newline at end of file
+        print ('%s\t%s' % (word, 1))
+'''
\ No newline at end of file
diff --git a/bigdata/src/C_H_reducer.py b/bigdata/src/C_H_reducer.py
index d8a05bb..02d281e 100644
--- a/bigdata/src/C_H_reducer.py
+++ b/bigdata/src/C_H_reducer.py
@@ -1,18 +1,26 @@
 #!/usr/bin/env python
 """reducer.py"""
 
+# TEST FILE TO CHECK HOW THE INPUT/OUTPUT WORKS
+
+
 from operator import itemgetter
 import sys
 
+'''
 current_word = None
 current_count = 0
 word = None
-
+'''
 # input comes from STDIN
+i=0
 for line in sys.stdin:
     # remove leading and trailing whitespace
+    i += 1
     line = line.strip()
-
+    print("THIS IS THE LINE NUMBER:{}".format(i))
+    print(line)
+'''
     # parse the input we got from mapper.py
     word, count = line.split('\t', 1)
 
@@ -37,4 +45,5 @@
 
 # do not forget to output the last word if needed!
 if current_word == word:
-    print ('%s\t%s' % (current_word, current_count))
\ No newline at end of file
+    print ('%s\t%s' % (current_word, current_count))
+    '''
\ No newline at end of file