import sys
# map.py -- Hadoop Streaming mapper for word count.
# Reads text lines from standard input and writes one "<word><TAB>1" record
# per word to standard output.


def map_words(lines):
    """Yield a tab-separated ``word`` / ``1`` record for every word in *lines*.

    Args:
        lines: iterable of text lines (for example ``sys.stdin``).

    Yields:
        One string per whitespace-separated word, formatted as the word, a
        tab character, and the literal count ``1``. Blank lines yield nothing.
    """
    for line in lines:
        # str.split() with no argument ignores leading/trailing whitespace
        # and collapses runs of whitespace, so no explicit strip() is needed.
        for word in line.split():
            # Tab is the key/value delimiter Hadoop Streaming expects by
            # default, and it matches the tab-separated output of the reduce
            # step.
            yield '%s\t%d' % (word, 1)


def run_mapper():
    """Map standard input to standard output, one record per line."""
    for record in map_words(sys.stdin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    run_mapper()
import sys
# reduce.py -- Hadoop Streaming reducer for word count.
# Reads "<word> <count>" records from standard input (the sorted output of
# the map step) and writes one "<word><TAB><total>" line per distinct word.


def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same word.

    The input must already be sorted (or at least grouped) by word, because
    only adjacent records are merged.

    Args:
        lines: iterable of text lines, each holding a word and an integer
            count separated by whitespace (tab or space).

    Yields:
        ``(word, total)`` tuples in input order, one per run of equal words.
        Nothing is yielded for empty input.
    """
    current_word = None
    current_count = 0
    for line in lines:
        # Split on the LAST run of whitespace so the count is always the
        # final field, whichever delimiter the mapper used.
        fields = line.strip().rsplit(None, 1)
        if len(fields) != 2:
            # Blank or single-field line: skip it instead of crashing on
            # tuple unpacking.
            continue
        word, count = fields
        try:
            count = int(count)
        except ValueError:
            # Count is not a number: ignore this record.
            continue
        if word == current_word:
            current_count += count
        else:
            # A new word starts: flush the finished group first.
            if current_word is not None:
                yield current_word, current_count
            current_word = word
            current_count = count
    # Flush the last group. Testing against None (rather than comparing with
    # the most recently parsed word) avoids emitting a bogus "None 0" record
    # on empty input and avoids losing the final group when the trailing
    # lines were skipped as malformed.
    if current_word is not None:
        yield current_word, current_count


def run_reducer():
    """Reduce standard input to standard output, one "<word><TAB><total>" line per word."""
    for word, total in reduce_counts(sys.stdin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('%s\t%s' % (word, total))


if __name__ == '__main__':
    run_reducer()
测试文本(保存为 ce.txt):
foo foo puu labs foo puu abc bar see you by test test
abc labs foo me python hadoop ab ac ab bc bc python
运行(在本地用管道模拟 map → sort → reduce 流程):
[root@alex ~]# cat ce.txt | python map.py | sort | python reduce.py
ab 2
abc 2
ac 1
bar 1
bc 2
by 1
foo 4
hadoop 1
labs 2
me 1
puu 2
python 2
see 1
test 2
you 1