首页 > 编程语言 > 详细

Python 遍历 Hadoop 数据,与指定列表对比,取出包含列表中值的记录。

时间:2015-12-19 06:35:08      阅读:394      评论:0      收藏:0      [点我收藏+]
import sys
import tstree

# Reducer-style script: reads "url count posttime" records from stdin,
# sums the counts and tracks the earliest posttime for each run of identical
# urls, and prints one line per url that matches an entry of the site list.
fname = 'high_freq_site.list'
tree = tstree.TernarySearchTrie()
tree.loadData(fname)

token = ''      # url of the group currently being accumulated
counter = 0     # running sum of the count column for that url
post = []       # every posttime seen for that url

# url, count, posttime
for line in sys.stdin:
    arr = line.strip().split()
    if len(arr) != 3:
        # Skip records that do not have exactly three fields.
        continue

    url = arr[0]
    num = int(arr[1])
    posttime = int(arr[2])

    if token != url:
        # A new url starts: flush the finished group (if any) first.
        if token != '':
            ret = tree.maxMatch(token)
            if ret and post:
                # Single-argument parenthesised print works on Python 2 and 3.
                print("%s\t%s\t%s\t%s" % (ret, token, counter, min(post)))
        token = url
        counter = 0
        post = []

    # Always account for the current record, including the first record of a
    # new group (previously its posttime was dropped after the reset).
    counter += num
    post.append(posttime)

# Flush the last group; with no input token is '' and nothing is printed
# as long as maxMatch yields nothing for the empty string.
ret = tree.maxMatch(token)
if ret and post:
    print("%s\t%s\t%s\t%s" % (ret, token, counter, min(post)))



class TSTNode(object):
    """One node of a ternary search trie.

    A node stores a single split character, an optional payload in ``data``
    and three child links (``loNode``, ``eqNode``, ``hiNode``).
    """

    def __init__(self, splitchar):
        """Create a childless node for ``splitchar`` with no payload."""
        self.splitchar = splitchar
        # No payload until a caller assigns one.
        self.data = None
        # All three branches start out absent.
        self.loNode = self.eqNode = self.hiNode = None


class TernarySearchTrie(object):
    """Character-keyed ternary search trie with longest-prefix lookup.

    Words are inserted with ``addWord`` / ``loadData``; ``maxMatch`` returns
    the payload of the longest stored word that is a prefix of the query.
    """

    def __init__(self):
        # Empty trie; the root node is created by the first addWord call.
        self.rootNode = None

    def loadData(self, fname):
        """Insert every non-blank line of the file ``fname`` into the trie.

        Each line is stripped of surrounding whitespace. The stripped text is
        used as the key and is also stored as ``data`` on the node that
        ``addWord`` returns for it. Lines that are empty after stripping are
        ignored.
        """
        # The context manager closes the file even if insertion raises.
        with open(fname) as f:
            for line in f:
                line = line.strip()
                node = self.addWord(line)
                if node:
                    node.data = line

    def addWord(self, word):
        """Insert ``word`` and return the node holding its last character.

        Returns None when ``word`` is empty. The caller is responsible for
        setting ``data`` on the returned node.
        """
        if not word:
            return None

        charIndex = 0
        if not self.rootNode:
            self.rootNode = TSTNode(word[0])

        currentNode = self.rootNode

        while True:
            # Sign of the difference selects the lo / eq / hi branch.
            charComp = ord(word[charIndex]) - ord(currentNode.splitchar)
            if charComp == 0:
                charIndex += 1
                if charIndex == len(word):
                    # Every character consumed: this is the terminal node.
                    return currentNode
                if not currentNode.eqNode:
                    currentNode.eqNode = TSTNode(word[charIndex])
                currentNode = currentNode.eqNode
            elif charComp < 0:
                if not currentNode.loNode:
                    currentNode.loNode = TSTNode(word[charIndex])
                currentNode = currentNode.loNode
            else:
                if not currentNode.hiNode:
                    currentNode.hiNode = TSTNode(word[charIndex])
                currentNode = currentNode.hiNode

    def maxMatch(self, url):
        """Return the payload of the longest stored word prefixing ``url``.

        Returns None when no stored word is a prefix of ``url`` (including
        when the trie or ``url`` is empty).
        """
        ret = None
        currentNode = self.rootNode
        charIndex = 0
        # Stop when the path runs out or every character has been consumed.
        while currentNode and charIndex < len(url):
            charComp = ord(url[charIndex]) - ord(currentNode.splitchar)
            if charComp == 0:
                # A matched node carrying data marks the end of a stored word;
                # later (deeper) matches overwrite earlier ones.
                if currentNode.data:
                    ret = currentNode.data
                charIndex += 1
                currentNode = currentNode.eqNode
            elif charComp < 0:
                currentNode = currentNode.loNode
            else:
                currentNode = currentNode.hiNode
        return ret


if __name__ == '__main__':
    # Manual check: load the site list, then print the longest matching
    # entry (or None) for every line read from stdin.
    import sys
    fname = 'high_freq_site.list'
    tree = TernarySearchTrie()
    tree.loadData(fname)

    for url in sys.stdin:
        url = url.strip()
        ret = tree.maxMatch(url)
        # Single-argument parenthesised print works on Python 2 and 3.
        print(ret)

 

Python 遍历 Hadoop 数据,与指定列表对比,取出包含列表中值的记录。

原文:http://www.cnblogs.com/i80386/p/5058584.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!