login每天学习一点点,每天进步一点点.
当前位置:首页 >> python多队列数据挖掘网站-案例

python多队列数据挖掘网站-案例

2015-12-30 23:14:00  |  分类: Python |  标签: 阅读(370)评论(0)
运行下面的代码之前,需要先安装 BeautifulSoup(bs4)模块:

http://c32.19aq.com/Python/Module/

#!/usr/bin/python
#-*- coding: utf-8 -*-
# @Author  : c32 (amd5@qq.com)
# @Blog    : http://cx7863.blog.163.com/
# @Version : 
# @DateTime:  2015-12-30 23:31:15

import Queue
import threading
import urllib2
import time
#from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup

# Seed URLs for the crawl. Every entry is queued and fetched once per
# occurrence, exactly as written, so keep the list free of duplicates and of
# leading/trailing whitespace inside the string literals.
hosts = [
    "http://www.baidu.com",
    "http://news.baidu.com/",
    "http://tieba.baidu.com/",
    "http://zhidao.baidu.com/",
    "http://image.baidu.com/",
    "http://music.baidu.com/",
    "http://v.baidu.com/",
    "http://map.baidu.com/",
    "http://wenku.baidu.com/",
    "http://stu.baidu.com/",
    "http://piao.baidu.com/",
    "http://www.hao123.com/",
    "http://fanyi.baidu.com/",
]

# URLs waiting to be downloaded: filled by main(), consumed by ThreadUrl.
queue = Queue.Queue()
# Downloaded page bodies waiting to be parsed: filled by ThreadUrl,
# consumed by DatamineThread.
out_queue = Queue.Queue()

class ThreadUrl(threading.Thread):
    """Worker thread that downloads pages.

    Repeatedly takes a URL from ``queue``, fetches it with
    ``urllib2.urlopen`` and puts the raw response body on ``out_queue``.
    """

    def __init__(self, queue, out_queue):
        """Remember the queues this worker reads from and writes to.

        Args:
            queue: Queue of URLs to download.
            out_queue: Queue that receives each downloaded page body.
        """
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        """Download URLs in an endless loop (thread entry point).

        The loop never returns on its own. A download that fails is reported
        on stderr and skipped, so one unreachable host does not kill the
        worker.
        """
        import sys  # local import: only needed for error reporting

        while True:
            # Block until a URL is available.
            host = self.queue.get()
            try:
                response = urllib2.urlopen(host)
                try:
                    chunk = response.read()
                finally:
                    # Release the connection even if read() fails.
                    response.close()
                # Hand the page body over to the parsing stage.
                self.out_queue.put(chunk)
            except Exception as exc:
                sys.stderr.write("%s: failed to fetch %r: %s\n"
                                 % (self.name, host, exc))
            finally:
                # Must run for every get(), success or failure: a missing
                # task_done() makes queue.join() block forever.
                self.queue.task_done()

class DatamineThread(threading.Thread):
    """Worker thread that parses downloaded pages and prints their titles.

    Repeatedly takes a page body from ``out_queue``, parses it with
    BeautifulSoup and prints the list of ``<title>`` tags found.
    """

    def __init__(self, out_queue):
        """Remember the queue of page bodies this worker consumes.

        Args:
            out_queue: Queue of downloaded page bodies to parse.
        """
        threading.Thread.__init__(self)
        self.out_queue = out_queue

    def run(self):
        """Parse pages in an endless loop (thread entry point).

        The loop never returns on its own. A page that cannot be parsed is
        reported on stderr and skipped.
        """
        import sys  # local import: only needed for error reporting

        while True:
            # Block until a downloaded page is available.
            chunk = self.out_queue.get()
            try:
                # Name the parser explicitly so the result does not depend on
                # which optional parsers happen to be installed.
                soup = BeautifulSoup(chunk, "html.parser")
                # Single parenthesised argument: prints the same text under
                # the Python 2 print statement and the Python 3 function.
                print(soup.findAll(['title']))
            except Exception as exc:
                sys.stderr.write("%s: failed to parse page: %s\n"
                                 % (self.name, exc))
            finally:
                # Must run for every get(), success or failure: a missing
                # task_done() makes out_queue.join() block forever.
                self.out_queue.task_done()

# Reference point for the elapsed-time report printed after main() returns.
start = time.time()


def main():
    """Run the crawl: start both worker pools, queue every host, then wait.

    Five ThreadUrl workers download the pages listed in ``hosts`` and five
    DatamineThread workers parse the downloaded bodies. The function returns
    once both queues report that every item has been processed.
    """
    # Start the download workers.
    for _ in range(5):
        t = ThreadUrl(queue, out_queue)
        # Daemon threads are killed when the main program exits; the workers
        # loop forever and would otherwise keep the process alive.
        t.daemon = True
        t.start()

    # Fill the work queue with the URLs to fetch.
    for host in hosts:
        queue.put(host)

    # Start the parsing workers.
    for _ in range(5):
        dt = DatamineThread(out_queue)
        dt.daemon = True
        dt.start()

    # Wait for the downloads first: only after queue.join() returns is every
    # page body guaranteed to be on out_queue, so the second join cannot
    # return while pages are still on their way.
    queue.join()
    out_queue.join()


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
    print("Elapsed Time: %s" % (time.time() - start))



#原文参考 http://www.ibm.com/developerworks/cn/aix/library/au-threadingpython/

上一篇:ubuntu使用nmap查询端口 下一篇:ThinkPHP5 的视图$view->fetch()和$view->display()的区别

猜你喜欢

发表评论:

0.165728s