# 爬虫笔记 (crawler notes — web-scraping snippets and cheat-sheet)

import os
import urllib.request

import pymysql
import requests
from pyquery import PyQuery as pq


# Insert into the database.
def insertsql(sql):
    """Execute an INSERT statement against the shici_org database.

    Prints and returns the auto-generated primary key of the inserted row.
    The connection is always closed, and the transaction is rolled back if
    the statement fails (the original exception is re-raised).

    SECURITY NOTE(review): callers appear to build *sql* by string
    concatenation (see the dbselect error message) — that is SQL-injection
    prone; prefer cursor.execute(sql, params) with placeholders.
    """
    db = pymysql.connect(
        host="localhost",
        port=3306,
        user='root',
        password="admin123",
        db="shici_org",
        charset="utf8")
    try:
        with db.cursor() as cursor:
            cursor.execute(sql)
            lastid = cursor.lastrowid
            print(lastid)
        # BUG FIX: pymysql does not autocommit by default — without this
        # commit the INSERT was silently discarded when db.close() ran.
        db.commit()
        return lastid
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()

# Query helper.
def dbselect(sql):
    """Execute a SELECT against the zuowen database and return all rows.

    Returns a list of result tuples; on error, prints a diagnostic and
    returns an empty list (best-effort, matching the original behavior).
    """
    db = pymysql.connect(host="localhost", user="root",
                         password="admin123", db="zuowen")
    cursor = db.cursor()
    lst = []
    try:
        cursor.execute(sql)
        # cursor.fetchone() would return a single row instead of all rows.
        lst = list(cursor.fetchall())
    except Exception:
        # BUG FIX: the message previously said "插入失败" (insert failed)
        # even though this helper only runs queries.
        print("查询失败,sql:" + sql)
    finally:
        # Cleanup moved to finally so the connection is released even if
        # fetching raises; the pointless commit() on a read was dropped.
        cursor.close()
        db.close()
    return lst


# Fetch page content: download the author index page and parse its HTML.

# Desktop-browser User-Agent so the site serves the normal HTML page.
# NOTE(review): the 'content-type' header has no effect on a GET request —
# presumably copied from an API example; confirm before removing.
header = {'content-type': 'application/json','User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}


url="https://so.gushiwen.cn/authors/"

t=requests.get(url=url,headers=header) # t = requests.get(url)


html = t.text

doc = pq(html)

# Each ".sonspic" element appears to be one author card; iterate them lazily.
items = doc(".sonspic").items()



# Iterate files: walk the two-level ./mizhi directory tree and build the
# corresponding data/mz output path for each file.
# BUG FIX: this snippet used `os` without importing it (NameError);
# `import os` is now present at the top of the file.
path = os.listdir("./mizhi")
for f in path:
    print(f)
    cpath = os.listdir("./mizhi/" + f)
    for cf in cpath:
        # Output path mirrors the input layout under data/mz/.
        filename = "data/mz/" + f + "/" + cf


# Download an image: save the file at a remote URL to a local path.
# NOTE: the two arguments are Chinese placeholders ("image URL" and
# "image path/image name") — substitute real values before running.
urllib.request.urlretrieve("图片地址","图片路径/图片名称")  # trailing semicolon removed (un-Pythonic)


# 循环数组加索引: loop over a sequence together with each element's index

for index,item in enumerate(arr):