每天学习一点点,每天进步一点点.
login

python多队列数据挖掘网站-案例

2015-12-30 23:14:00  |  分类: Python |  标签: 无 阅读(2690)    评论(0)

先导入BeautifulSoup模块
http://c32.19aq.com/Python/Module/

  1. #!/usr/bin/python
  2. #-*- coding: utf-8 -*-
  3. # @Author : c32 (amd5@qq.com)
  4. # @Blog : http://cx7863.blog.163.com/
  5. # @Version :
  6. # @DateTime: 2015-12-30 23:31:15
  7. import Queue
  8. import threading
  9. import urllib2
  10. import time
  11. #from BeautifulSoup import BeautifulSoup
  12. from bs4 import BeautifulSoup
  13. hosts = ["http://www.baidu.com", "http://news.baidu.com/", "http://tieba.baidu.com/","http://zhidao.baidu.com/",
  14. "http://image.baidu.com/ ","http://music.baidu.com/","http://v.baidu.com/","http://map.baidu.com/","http://wenku.baidu.com/",
  15. "http://stu.baidu.com/","http://stu.baidu.com/","http://piao.baidu.com/","http://www.hao123.com/","http://fanyi.baidu.com/"]
  16. queue = Queue.Queue()
  17. out_queue = Queue.Queue()
  18. class ThreadUrl(threading.Thread):
  19. """Threaded Url Grab"""
  20. def __init__(self, queue, out_queue):
  21. threading.Thread.__init__(self)
  22. self.queue = queue
  23. self.out_queue = out_queue
  24. def run(self): #run 方法用于要执行的功能 #getName()用于获取线程名称
  25. while True:
  26. #从队列中获取一个任务
  27. host = self.queue.get()
  28. #抓取工作
  29. url = urllib2.urlopen(host)
  30. chunk = url.read()
  31. ##将chunk中的数据传给out_queue
  32. self.out_queue.put(chunk)
  33. #标记队列工作已完成
  34. self.queue.task_done()
  35. class DatamineThread(threading.Thread):
  36. """Threaded Url Grab"""
  37. def __init__(self, out_queue):
  38. threading.Thread.__init__(self)
  39. self.out_queue = out_queue
  40. def run(self):
  41. while True:
  42. #从队列中获取一个任务
  43. chunk = self.out_queue.get()
  44. #抓取工作
  45. soup = BeautifulSoup(chunk)
  46. print soup.findAll(['title'])
  47. #标记队列工作已完成
  48. self.out_queue.task_done()
  49. start = time.time()
  50. def main():
  51. #创建队列实例
  52. for i in range(5):
  53. t = ThreadUrl(queue, out_queue)
  54. t.setDaemon(True)#主程序退出时,子线程也立即退出
  55. t.start()#启动线程
  56. #向队列中填充数数
  57. for host in hosts:
  58. queue.put(host)
  59. for i in range(5):
  60. dt = DatamineThread(out_queue)
  61. dt.setDaemon(True)
  62. dt.start()
  63. #只到所有任务完成后,才退出主程序
  64. queue.join()
  65. out_queue.join()
  66. main()
  67. print "Elapsed Time: %s" % (time.time() - start)

原文参考 http://www.ibm.com/developerworks/cn/aix/library/au-threadingpython/

留言区域