c32's blog
Learn a little every day, improve a little every day.
Python multi-queue website data mining - an example
2015-12-30 23:14:00 | Category: Python | Tags: none

#!/usr/bin/python
#-*- coding: utf-8 -*-
# @Author : c32 (amd5@qq.com)
# @Blog : http://cx7863.blog.163.com/
# @Version :
# @DateTime: 2015-12-30 23:31:15
import Queue
import threading
import urllib2
import time
#from BeautifulSoup import BeautifulSoup  # old BeautifulSoup 3 import
from bs4 import BeautifulSoup
hosts = ["http://www.baidu.com", "http://news.baidu.com/", "http://tieba.baidu.com/","http://zhidao.baidu.com/",
"http://image.baidu.com/ ","http://music.baidu.com/","http://v.baidu.com/","http://map.baidu.com/","http://wenku.baidu.com/",
"http://stu.baidu.com/","http://stu.baidu.com/","http://piao.baidu.com/","http://www.hao123.com/","http://fanyi.baidu.com/"]
queue = Queue.Queue()
out_queue = Queue.Queue()
class ThreadUrl(threading.Thread):
    """Threaded URL grab."""
    def __init__(self, queue, out_queue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):  # run() holds the work each thread executes; getName() would return the thread's name
        while True:
            # Grab a host from the input queue
            host = self.queue.get()
            # Fetch the page
            url = urllib2.urlopen(host)
            chunk = url.read()
            # Pass the raw HTML in chunk to out_queue for the mining threads
            self.out_queue.put(chunk)
            # Signal that this queue task is done
            self.queue.task_done()
class DatamineThread(threading.Thread):
    """Threaded data mining."""
    def __init__(self, out_queue):
        threading.Thread.__init__(self)
        self.out_queue = out_queue

    def run(self):
        while True:
            # Grab a page from the queue
            chunk = self.out_queue.get()
            # Parse it and print any <title> tags (bs4 wants an explicit parser)
            soup = BeautifulSoup(chunk, "html.parser")
            print soup.findAll(['title'])
            # Signal that this queue task is done
            self.out_queue.task_done()
start = time.time()
def main():
    # Spawn a pool of fetcher threads
    for i in range(5):
        t = ThreadUrl(queue, out_queue)
        t.setDaemon(True)  # daemon threads exit as soon as the main program does
        t.start()
    # Populate the input queue with hosts
    for host in hosts:
        queue.put(host)
    # Spawn a pool of data-mining threads
    for i in range(5):
        dt = DatamineThread(out_queue)
        dt.setDaemon(True)
        dt.start()
    # Exit the main program only after every task in both queues is done
    queue.join()
    out_queue.join()
main()
print "Elapsed Time: %s" % (time.time() - start)
Note: import the BeautifulSoup module first (modules page: http://c32.19aq.com/Python/Module/).
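If bs4 is not already present, it can typically be installed from PyPI (the current package name is beautifulsoup4):

pip install beautifulsoup4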
Original reference: http://www.ibm.com/developerworks/cn/aix/library/au-threadingpython/