Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

- from pyspider.libs.base_handler import *
5 c7 b, q5 {' i4 ~2 V$ ~( C; i% E0 x - import pymysql
1 y/ y/ d1 z! T- {7 E" t - import random9 F; f% Q6 B( O: d( ?
- import datetime
/ V5 i9 W0 v+ r% V' V, c Q3 a - import urllib2,HTMLParser,re5 W0 c8 I. N B( s0 p5 \
- import os- @1 @9 `, j# D
- import sys1 s) K9 c: t& F: V/ i) J
- import re! @5 ^ M) P& U+ n0 E# \ [" H
- import codecs) ^) A# J4 a. C' b' N; J
- import requests8 o( M, g: t. {. J
- import json5 ~ Q+ g1 {/ W$ e2 k, t. O
-
class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes each chapter.

    Callback chain: ``on_start`` -> ``list_Caterg`` (category listing) ->
    ``list_Caterg_detail`` (book page) -> ``index_page`` (chapter list) ->
    ``detail_page`` (one chapter, which queues the following chapter).
    Each chapter is inserted into MySQL and posted to a Locoy publishing
    endpoint; book covers are downloaded under ``P_dir``.
    """

    # ``global`` in a class body makes the two assignments below bind
    # module-level names instead of class attributes.  The methods read
    # ``P_dir`` as a module global, so this binding is kept as it was.
    global Datos, P_dir
    P_dir = '/Tools/Debug/'  # local directory that downloaded covers are saved under
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Site category slug -> category name handed to the publishing side.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book dict passed between callbacks through ``save=``.
    _SAVE_KEYS = (
        'Cater_Name',
        'Book_author',
        'Book_Introduction',
        'Book_Synopsis',
        'Book_Palabras',
        'img',
    )

    # Literal (old, new) substitutions applied, in order, to chapter text.
    # NOTE(review): the last pair is a regex-looking string used with
    # ``str.replace``; it only matches that literal text, so it looks like a
    # no-op in practice.  Kept unchanged - confirm the intent.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal (old, new) substitutions applied, in order, to the book intro.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Runs of these characters are deleted after the literal substitutions.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    # ------------------------------------------------------------------
    # MySQL publishing
    # ------------------------------------------------------------------
    def _connect(self):
        """Open a new connection to the target MySQL database."""
        return pymysql.connect(host="localhost", user="数据库用户名", password="密码", db="数据库名", charset="utf8")

    def _insert(self, sql, params, verbose=False):
        """Run one parameterized INSERT, commit it, and close the connection.

        ``sql`` uses bare ``%s`` placeholders and ``params`` supplies the
        values, so the driver escapes scraped text instead of it being
        spliced into the statement.  Execution errors are printed and rolled
        back rather than raised, as before.  With ``verbose`` the statement
        template and the new row id are printed.
        """
        db = self._connect()
        try:
            cursor = db.cursor()
            if verbose:
                print(sql)
            cursor.execute(sql, params)
            if verbose:
                print(cursor.lastrowid)
            db.commit()
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
        finally:
            # The original never closed the connection (one leak per row).
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book name, id, cover URL, local cover path, date)."""
        sql = 'insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) values (%s,%s,%s,%s,%s)'
        self._insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (book name, chapter title, book id, chapter id, date)."""
        sql = 'insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) values (%s,%s,%s,%s,%s)'
        self._insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter row into ``BookConte`` and print the new row id."""
        sql = (
            'INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
            'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) '
            'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        )
        params = (
            Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
            Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover,
            Book_Date,
        )
        self._insert(sql, params, verbose=True)

    # ------------------------------------------------------------------
    # Locoy publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint and return the response.

        Text fields are encoded to GBK, dropping characters GBK cannot
        represent; the cover URL is sent unchanged.
        """
        # NOTE(review): Python 2-only hack that changes the interpreter-wide
        # default encoding.  Every field below is already encoded explicitly,
        # so this is probably unnecessary - kept to avoid changing behaviour.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publish endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl entry point and callbacks
    # ------------------------------------------------------------------
    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue listing pages ``page_num``..``total_num`` for every category.

        The loop counters are local.  The original advanced ``self.page_num``
        itself, so a handler instance that is reused for the next scheduled
        run would start past ``total_num`` and queue nothing.
        """
        Cater_Name = []
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # An unmapped slug keeps the previous name, as the original
                # chain of ``if`` statements did.
                Cater_Name = self._CATEGORY_NAMES.get(cater_id, Cater_Name)
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Queue every absolute link inside ``.pic-list`` as a book page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, download its cover, record it, queue its chapter list.

        The cover fields default to empty strings, so a page without a
        matching cover image no longer raises ``NameError``.
        NOTE(review): the paste lost its indentation, so whether the original
        skipped cover-less books is unknown; they are recorded here.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Word count: second space-separated token, up to the first '|'.
        # Guarded because a differently shaped header raised IndexError.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        img = ''
        Locaimg = ''
        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _carry_over(self, saved):
        """Return a fresh dict holding the per-book fields of ``saved``."""
        return {key: saved[key] for key in self._SAVE_KEYS}

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter link of the chapter list."""
        Datos = self._carry_over(response.save)
        # Only the first list item is queued; ``detail_page`` queues the
        # chapters after it.
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    def _scrub(self, text, replacements):
        """Apply the literal ``replacements`` in order, then delete ``_STRIP_PATTERN`` matches."""
        for old, new in replacements:
            text = text.replace(old, new)
        return re.sub(self._STRIP_PATTERN, u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it, queue the follow-up link, return the record."""
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # category
        Book_author = saved['Book_author']  # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # Labelled "book status" by the original author; built from the
        # chapter title plus the unscrubbed saved fields.
        abover = Booktitle + Book_Synopsis + Book_Palabras + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        chapter_text = response.doc('.article-con').text()
        BookConte = self._scrub(chapter_text, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(saved['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # The original had a trailing comma here, which made this a 1-tuple.
        Book_img = saved['img']  # cover URL

        # MySQL publishing (optional; remove the calls if not needed).
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Locoy publishing (optional; remove the call if not needed).
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._carry_over(saved)
        # Presumably the "next chapter" button - confirm against the site markup.
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and store it as ``imgDir/file_name``.

        Returns ``True`` when the file was written and ``False`` when the
        request failed or returned an error status.  The original returned
        ``None``, so the caller's success check never fired.  ``P_dir`` is
        unused and kept only for signature compatibility.
        """
        try:
            imag = requests.get(Book_img)
        except requests.RequestException as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        if not imag.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback variant: write ``response.content`` to ``save['imgDir']/save['file_name']``."""
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join supplies the separator the original concatenation omitted.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Create ``imgDir`` if missing and write ``content`` to ``path``."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def _last_segment(self, url):
        """Return the last path segment of ``url`` without query string or fragment."""
        path = url.split('?', 1)[0].split('#', 1)[0]
        return path.split("/")[-1]

    def getExtension(self, url):
        """Return the extension of the file named by ``url`` ('' if it has none)."""
        segment = self._last_segment(url)
        if "." not in segment:
            return ''
        return segment.split(".")[-1]

    def getname(self, url):
        """Return the file name in ``url`` up to its first dot."""
        return self._last_segment(url).split(".")[0]
复制代码 5 Y5 U6 s- Y# W+ d% s8 s. W* y
5 C6 t$ J. }4 A$ B$ Q |