Learn a little every day, improve a little every day.

FreeBuf crawler + write to database + write to local file

2016-08-04 23:21:00  |  Category: Python  |  Tags: none
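The script below crawls the first couple of pages of FreeBuf's article list with urllib2, extracts each article's URL, thumbnail, and title with a regular expression, appends the URL and title to a local list.txt, and inserts the same pair into a MySQL table via MySQLdb.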
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : c32 (amd5@qq.com)
# @Blog : http://cx7863.blog.163.com/
# @Version :
# @DateTime: 2016-08-04 22:18:52
import urllib2
import re
import MySQLdb

conn = MySQLdb.connect(host='192.168.1.10', user='root',
                       passwd='test', db='test', port=3306, charset='utf8')

def getHtml(url):  # fetch the HTML source
    page = urllib2.urlopen(url)
    html = page.read()
    return html

def urlPages(page):  # build the URL of one list page
    url = 'http://www.freebuf.com/articles/page/' + str(page)
    return url

def findList(html):  # regex-match the article list
    # "calss" mirrors the misspelled attribute in the site's markup at the time
    myItems = re.findall('<a target="_blank" href="(.*?)"><img calss="img-responsive" src="(.*?)" title="(.*?)" alt=', html, re.S)
    return myItems

for page in range(1, 2 + 1):  # number of pages to crawl
    html = getHtml(urlPages(page))
    items = findList(html)
    cur = conn.cursor()
    for item in items:
        # item[0] is the article URL, item[1] the thumbnail, item[2] the title
        s = item[0] + ' ' + item[2] + '\n'
        file_object = open('list.txt', 'a')
        file_object.write(s)  # append URL and title to the local text file
        file_object.close()
        print item[0]
        # TODO: also batch-write each article to a local HTML file (still testing)
        cur.execute('INSERT INTO python (url, title) VALUES (%s, %s)',
                    (item[0], item[2]))
    conn.commit()
    cur.close()
conn.close()
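The INSERT assumes a table named python with url and title columns already exists in the test database. A minimal one-time setup sketch (the id column and the column types are my assumptions, not from the original post):

import MySQLdb

conn = MySQLdb.connect(host='192.168.1.10', user='root',
                       passwd='test', db='test', port=3306, charset='utf8')
cur = conn.cursor()
# Create the target table once; column names come from the INSERT above,
# the types are assumed.
cur.execute('''
    CREATE TABLE IF NOT EXISTS python (
        id INT AUTO_INCREMENT PRIMARY KEY,
        url VARCHAR(255) NOT NULL,
        title VARCHAR(255) NOT NULL
    ) DEFAULT CHARSET=utf8
''')
conn.commit()
cur.close()
conn.close()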
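For anyone on Python 3, where urllib2 and MySQLdb are unavailable, a rough port might look like the following. It assumes the requests and pymysql packages (neither appears in the original) and that FreeBuf's markup still matches the 2016-era regex, which a site redesign would break:

import re
import requests
import pymysql

# Same pattern as the original script, including the site's "calss" typo.
PATTERN = re.compile(
    r'<a target="_blank" href="(.*?)"><img calss="img-responsive" '
    r'src="(.*?)" title="(.*?)" alt=', re.S)

conn = pymysql.connect(host='192.168.1.10', user='root',
                       password='test', db='test', port=3306, charset='utf8')
try:
    with conn.cursor() as cur, open('list.txt', 'a', encoding='utf-8') as f:
        for page in range(1, 3):  # pages 1 and 2, as in the original
            html = requests.get(
                'http://www.freebuf.com/articles/page/%d' % page).text
            for url, thumb, title in PATTERN.findall(html):
                f.write('%s %s\n' % (url, title))
                cur.execute('INSERT INTO python (url, title) VALUES (%s, %s)',
                            (url, title))
    conn.commit()
finally:
    conn.close()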