Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import contextlib
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *

# Local directory under which downloaded cover images are saved.
P_dir = '/Tools/Debug/'
# Kept only so the module still exposes this name; per-book metadata travels
# between callbacks through ``save`` rather than through this global.
Datos = {}


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes what it finds.

    Flow: ``on_start`` schedules the category listing pages, ``list_Caterg``
    follows every book link, ``list_Caterg_detail`` reads the book metadata
    and cover, ``index_page`` opens the first chapter and ``detail_page``
    stores each chapter and follows the "next chapter" link.

    Each chapter is written to MySQL (``add_question`` / ``add_comment``) and
    posted to a Locoy publishing endpoint (``add_locoy``).
    """

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # MySQL connection settings shared by every insert. The user, password
    # and database values are placeholders that must be filled in.
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Category slug used in listing URLs -> category name stored/published.
    CATEGORY_NAMES = {
        'xuanhuan': u'玄幻',
        'wuxia': u'武侠',
        'lishi': u'历史',
        'yanqing': u'都市',
        'nvsheng': u'都市',
        'kehuan': u'科幻',
        'kongbu': u'游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied to chapter text, in order.
    # NOTE: the '^\\n\\n' entry is plain text handed to str.replace, not a
    # regular expression, so it only matches that exact character sequence.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal (old, new) substitutions applied to the book introduction.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Runs of these ASCII / CJK punctuation characters are deleted from text.
    _PUNCTUATION_RE = re.compile(
        r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
    )

    def _insert(self, sql, params):
        """Execute one parameterized INSERT on a fresh connection.

        Values are passed separately from the statement so the driver escapes
        them; scraped text containing quotes can no longer break or alter the
        query. The connection is always closed.

        Returns the cursor's ``lastrowid`` on success. On failure the error is
        printed, the transaction is rolled back and ``None`` is returned.
        """
        with contextlib.closing(pymysql.connect(**self.db_config)) as db:
            try:
                with contextlib.closing(db.cursor()) as cursor:
                    cursor.execute(sql, params)
                    row_id = cursor.lastrowid
                db.commit()
                return row_id
            except Exception as err:
                # Best effort, as before: report and keep the crawl going.
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book name, id, cover URL, local cover path, date)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (book name, chapter title, book id, chapter id, date)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter row into ``BookConte`` and print the new row id."""
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
                  Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover,
                  Book_Date)
        row_id = self._insert(sql, params)
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint and return the response.

        Text fields are encoded to GBK explicitly (unencodable characters are
        dropped), so the interpreter-wide default encoding is left alone.
        """
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name (placeholder)
            'my_p': '密码',  # back-office password (placeholder)
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data, timeout=60)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first listing page to crawl
        self.total_num = 200   # last listing page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every listing page of every category.

        The page counter is a local loop variable: ``self.page_num`` is not
        advanced, so each scheduled run starts from the first page again
        instead of finding the counter already past ``self.total_num``.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self.CATEGORY_NAMES.get(cater_id, u'')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every absolute book link on a category listing page."""
        cater_name = response.save
        for link in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(link.attr.href, callback=self.list_Caterg_detail, save=cater_name)

    def list_Caterg_detail(self, response):
        """Read book metadata and cover from a book page, then open its chapter catalogue."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Word count: the text between the first space and the next '|'.
        # Guarded so a page without that layout yields '' instead of IndexError.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults keep the rest of the method working when no cover matches.
        img = ''
        Locaimg = ''
        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            # Save the cover as <P_dir><name>/<name>.<extension>.
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional database publishing hook; remove if not needed.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for link in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(link.attr.href, callback=self.index_page, save=Datos)

    def _book_meta(self, save):
        """Return a fresh dict holding just the per-book metadata keys of ``save``."""
        return dict((key, save[key]) for key in self._META_KEYS)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Open the first chapter listed in the catalogue, passing the book metadata along."""
        Datos = self._book_meta(response.save)
        for link in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(link.attr.href, callback=self.detail_page, save=Datos)

    def _scrub(self, text, replacements):
        """Apply the literal ``replacements`` in order, then delete punctuation runs."""
        for needle, substitute in replacements:
            text = text.replace(needle, substitute)
        return self._PUNCTUATION_RE.sub(u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Store and publish one chapter, then follow the "next chapter" link.

        Returns a dict with the chapter and book fields as the task result.
        """
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = response.save['Cater_Name']        # category
        Book_author = response.save['Book_author']      # author
        Book_Synopsis = response.save['Book_Synopsis']  # latest update
        Book_Palabras = response.save['Book_Palabras']  # word count
        Bookurl = response.url                          # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()    # book id
        # Labelled "novel status" by the original author; it is the chapter
        # title, latest update, word count and raw introduction concatenated.
        abover = Booktitle + Book_Synopsis + Book_Palabras + response.save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        raw_content = response.doc('.article-con').text()  # chapter text
        BookConte = self._scrub(raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(response.save['Book_Introduction'], self._INTRO_REPLACEMENTS)

        # Chapter id: the URL path segment that follows "<BookID>/".
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string; a stray trailing comma used to turn this into a 1-tuple.
        Book_img = response.save['img']  # cover URL

        # Optional publishing hooks (MySQL, then Locoy); remove any not needed.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # The next chapter receives the original, unscrubbed metadata.
        Datos = self._book_meta(response.save)
        for link in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(link.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download ``Book_img`` to ``imgDir/file_name``.

        ``P_dir`` is accepted for compatibility and is not used. Returns
        ``True`` when the file was written and ``False`` when the request
        failed or did not answer 200. The file is only created after a
        successful download, so failures leave no empty file behind.
        """
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        path = imgDir + "/" + file_name
        try:
            reply = requests.get(Book_img, timeout=60)
        except requests.RequestException as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        if reply.status_code != 200:
            return False
        with open(path, 'wb') as out:
            out.write(reply.content)
        return True

    def save_imgs(self, response):
        """Callback that stores ``response.content`` at the location named in ``response.save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # Exactly one separator between directory and file name, whether or
        # not the directory already ends with one.
        file_path = imgDir.rstrip("/") + "/" + file_name.lstrip("/")
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if it is missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in ``url``."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first '.'."""
        return url.split("/")[-1].split(".")[0]
复制代码
9 F$ p& U# D6 Y ; Y- ~2 a$ w( A: X3 y R; o3 I% P
|