#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Python + pyspider crawler for a novel site: stores results in MySQL,
# publishes through the Locoy (火车头) interface and downloads cover images
# locally; it can serve as a template for writing other crawlers.
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that scrapes a novel site and republishes chapters.

    Callback chain:
      on_start            -> queues the category listing pages
      list_Caterg         -> queues every book linked from a listing page
      list_Caterg_detail  -> extracts book metadata, downloads the cover,
                             records the book and queues its chapter index
      index_page          -> queues the first chapter of the book
      detail_page         -> cleans and stores/publishes one chapter, then
                             follows the "next chapter" link
    """

    # Local directory under which downloaded cover images are stored.
    P_dir = '/Tools/Debug/'

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # MySQL connection settings shared by every insert helper. The user,
    # password and database values are placeholders to be filled in.
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category name passed along with each book.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback.
    _META_KEYS = (
        'Cater_Name',
        'Book_author',
        'Book_Introduction',
        'Book_Synopsis',
        'Book_Palabras',
        'img',
    )

    # (needle, replacement) pairs applied in order with str.replace().
    # NOTE(review): r'^\\n\\n' is used as a *literal* needle (caret and
    # backslashes included), exactly as in the original code, so it will
    # rarely if ever match -- confirm whether a regex was intended.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # NOTE(review): the third replacement value differs from the one used
    # for chapter content; kept as in the original -- confirm it is intended.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Runs of punctuation removed from chapter text and introductions.
    # NOTE(review): under Python 2 this is a byte-string pattern, so the
    # non-ASCII punctuation in the second character class presumably does not
    # match unicode text as intended -- confirm before switching the literal
    # to unicode, since that would change the published text.
    _PUNCTUATION_RE = re.compile(
        r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
    )

    def __init__(self):
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        # Kept for backward compatibility; no longer used as a loop variable.
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first listing page to crawl
        self.total_num = 200   # last listing page to crawl (inclusive)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _insert_row(self, table, columns, values):
        """Insert one row into *table* and return the new row id.

        The statement is parameterized, so scraped text containing quotes
        cannot break or inject into the SQL. *table* and *columns* must be
        trusted constants. On failure the error is printed, the transaction
        is rolled back and ``None`` is returned (best effort, as before).
        The connection is always closed.
        """
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (
            table,
            ', '.join(columns),
            ', '.join(['%s'] * len(columns)),
        )
        db = pymysql.connect(**self.db_config)
        try:
            cursor = db.cursor()
            cursor.execute(sql, tuple(values))
            row_id = cursor.lastrowid
            db.commit()
            return row_id
        except Exception as err:
            # Deliberately broad: one failed insert must not abort the crawl.
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def _book_meta(self, saved):
        """Return a fresh copy of the per-book metadata found in *saved*."""
        return {key: saved[key] for key in self._META_KEYS}

    def _clean_text(self, text, replacements):
        """Apply the literal *replacements* in order, then strip punctuation."""
        for needle, replacement in replacements:
            text = text.replace(needle, replacement)
        return self._PUNCTUATION_RE.sub(u'', text)

    # ------------------------------------------------------------------
    # Publishing back ends
    # ------------------------------------------------------------------
    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Record a book and its cover (remote URL and local path) in BookFile."""
        self._insert_row(
            'BookFile',
            ('Bookname', 'BookID', 'img', 'Locaimg', 'Book_Date'),
            (Bookname, BookIDs, img, Locaimg, Book_Dates),
        )

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Record a chapter title in BookTitle."""
        self._insert_row(
            'BookTitle',
            ('Bookname', 'Booktitle', 'BookID', 'Titleid', 'Book_Date'),
            (Bookname, Booktitle, BookID, Titleid, Book_Date),
        )

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Store a full chapter in BookConte and print the new row id."""
        row_id = self._insert_row(
            'BookConte',
            ('Bookname', 'Cater_Name', 'Book_author', 'Book_Introduction',
             'Book_Synopsis', 'Book_Palabras', 'Bookurl', 'Booktitle', 'BookID',
             'BookConte', 'Titleid', 'abover', 'Book_Date'),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
             BookConte, Titleid, abover, Book_Date),
        )
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are sent GBK-encoded; characters that cannot be encoded
        are dropped. Returns the ``requests`` response object.
        """
        if sys.version_info[0] == 2:
            # NOTE(review): process-wide default-encoding hack kept from the
            # original for Python 2; it mutates interpreter state and should
            # be removed once nothing relies on implicit GBK conversion.
            reload(sys)
            sys.setdefaultencoding("gbk")

        def gbk(text):
            return text.encode('gbk', 'ignore')

        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publish endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': gbk(Bookname),
            'caid': gbk(Cater_Name),
            'title_669977_net': gbk(Booktitle),
            'article': gbk(BookConte),
            'author': gbk(Book_author),
            'ready_1': gbk(Book_Palabras),
            'thumb': Book_img,
            'content': gbk(Book_Introduction),
            'abover': gbk(abover),
        }
        # A timeout keeps a stalled endpoint from hanging the worker forever.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # pyspider callbacks
    # ------------------------------------------------------------------
    @every(minutes=8 * 60)
    def on_start(self):
        """Queue listing pages ``page_num``..``total_num`` of every category.

        Pages are iterated with a local counter: the original advanced
        ``self.page_num`` in place, so any second call on the same instance
        queued nothing.
        """
        for page_num in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                Cater_Name = self.CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page_num) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Queue every book linked from a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Extract book metadata, fetch the cover and queue the chapter index."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Word count is the second space-separated token, up to the first '|'.
        # Fall back to '' instead of raising when the markup is different.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults so a page without a cover image no longer raises NameError.
        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Derive the local path of the cover from its URL.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension if extension else name
            imgDir = self.P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # The next two lines may be commented out to skip the local download.
            if self.download(self.P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook, convenient for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter of a book from its chapter index.

        Only the first list item is followed; ``detail_page`` then walks the
        remaining chapters through each page's "next" link.
        """
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Clean, store and publish one chapter, then follow the next chapter.

        Returns a dict with the chapter and book fields as the task result.
        """
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']        # category
        Book_author = saved['Book_author']      # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        # No trailing comma here: the original accidentally built a 1-tuple.
        Book_img = saved['img']                 # cover image URL
        Bookurl = response.url                  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()    # book id
        # Labelled "novel status" by the original author; it is the chapter
        # title concatenated with three of the saved book fields.
        abover = Booktitle + Book_Synopsis + Book_Palabras + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # scrape time

        raw_content = response.doc('.article-con').text()  # chapter body
        BookConte = self._clean_text(raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._clean_text(saved['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]

        # Optional: database publishing hooks, convenient for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                          BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: publish through the Locoy HTTP interface.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._book_meta(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download *Book_img* to ``imgDir/file_name``.

        The image is fetched before the file is opened, so a failed request
        no longer leaves an empty file behind. Returns True when the image
        was written and False when the server answered with an error status.
        Network errors raised by ``requests`` still propagate. *P_dir* is
        accepted for backward compatibility and is not used.
        """
        imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        if not imag.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(os.path.join(imgDir, file_name), 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback that writes a fetched image using the path info in ``save``."""
        imgDir = response.save["imgDir"]
        # os.path.join supplies the separator the original concatenation lacked.
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write *content* to *path*, creating *imgDir* first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the file extension of *url* without the dot.

        Query string and fragment are ignored; '' is returned when the last
        path segment has no extension.
        """
        filename = url.split("?")[0].split("#")[0].split("/")[-1]
        if "." not in filename:
            return ''
        return filename.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of *url* up to its first dot."""
        filename = url.split("?")[0].split("#")[0].split("/")[-1]
        return filename.split(".")[0]