c32's blog
每天学习一点点,每天进步一点点.
-
freebuf爬虫+写数据库+写本地文件
2016-08-04 23:21:00 | 分类: Python | 标签: 无 阅读(2884) 评论(0)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : c32 (amd5@qq.com)
# @Blog : http://cx7863.blog.163.com/
# @Version :
# @DateTime: 2016-08-04 22:18:52
import urllib2
import re
import os
import MySQLdb
# Module-level MySQL connection shared by the crawl loop below.
# NOTE(review): credentials and host are hard-coded — move to config/env
# before reusing this script outside a lab network.
conn = MySQLdb.connect(host='192.168.1.10', user='root',
passwd='test', db='test', port = 3306, charset = 'utf8')
def getHtml(url):
    # Fetch *url* over HTTP and return the raw response body (HTML source).
    response = urllib2.urlopen(url)
    return response.read()
def urlPages(page):
    # Build the article-listing URL for the given page number.
    base = 'http://www.freebuf.com/articles/page/'
    return base + str(page)
def findList(html):
    # Extract (article url, image src, title) tuples from listing-page HTML.
    # NOTE: "calss" below mirrors a typo in the site's own markup — do not "fix" it.
    pattern = re.compile(
        '<a target="_blank" href="(.*?)"><img calss="img-responsive" src="(.*?)" title="(.*?)" alt=',
        re.S)
    return pattern.findall(html)
# Stray debug prints removed: `myItems` is a local variable of findList(),
# so `print myItems` at module level raised NameError and aborted the
# script before the crawl loop ever ran.
# Main driver: crawl the listing pages, append "url title" lines to
# list.txt, and insert (url, title) rows into the `python` MySQL table.
# Fixes vs. the original: the file write actually happens (previously `s`
# was computed but the write was commented out while the file was still
# opened and closed per item); one file handle and one cursor per page
# instead of per item; try/finally guarantees both are released even if
# a fetch or insert raises.
for page in range(1, 2 + 1):  # inclusive range of pages to crawl
    html = getHtml(urlPages(page))
    items = findList(html)
    file_object = open('list.txt', 'a')
    cur = conn.cursor()
    try:
        for item in items:
            # item = (article url, image src, title)
            s = item[0] + ' ' + item[2] + '\n'
            file_object.write(s)  # persist to the local text file
            print(item[0])
            # Parameterized statement — never build SQL by string concatenation.
            cur.execute('INSERT INTO python (url, title) VALUES (%s, %s)',
                        (item[0], item[2]))
        conn.commit()  # one commit per page instead of per row
    finally:
        cur.close()
        file_object.close()
conn.close()