Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

# pyspider's star import stays first so the explicit imports below win on any
# name clash, exactly as in the original ordering.
from pyspider.libs.base_handler import *

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site, stores rows in MySQL,
    publishes chapters to a Locoy HTTP endpoint and downloads cover images.

    Callback chain:
        on_start -> list_Caterg -> list_Caterg_detail -> index_page
        -> detail_page (which keeps following the "next chapter" link).

    NOTE(review): the original paste had lost all indentation; block nesting
    was reconstructed from the statements themselves - confirm against the
    deployed script before relying on it.
    """

    # Module-level globals, kept because the original declared them with
    # `global` at class scope and code outside this class may read them.
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    global Datos
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Timeout (seconds) for the direct `requests` calls made by this handler;
    # mirrors crawl_config['timeout'] so a dead peer cannot hang a worker.
    request_timeout = 300

    # MySQL connection settings (placeholders: fill in real credentials).
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Category slug used in list URLs -> category name stored with each book.
    _CATER_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback.
    _SAVE_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (str.replace) substitutions applied to chapter text, in order.
    # NOTE(review): the r'^\\n\\n' entry is used as a literal, not a regex,
    # exactly as in the original - it looks like a regex was intended; confirm.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )

    # Literal substitutions applied to the book introduction, in order.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Punctuation stripped from chapter text and introductions.  On Python 2
    # a plain literal is a byte string, and its multi-byte characters would be
    # compared byte-by-byte against unicode page text, so decode it once.
    _PUNCTUATION = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
    if isinstance(_PUNCTUATION, bytes):
        _PUNCTUATION = _PUNCTUATION.decode('utf-8')
    _PUNCTUATION_RE = re.compile(_PUNCTUATION)

    def _insert_row(self, sql, params):
        """Run one parameterized INSERT in its own connection.

        The driver escapes ``params``, so scraped text containing quotes can
        neither break nor inject into the statement.  On any error the
        failure is printed and the transaction rolled back; the connection is
        always closed.

        Returns the new row id, or None when the insert failed.
        """
        db = pymysql.connect(**self.db_config)
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, params)
                row_id = cursor.lastrowid
            finally:
                cursor.close()
            db.commit()
            return row_id
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into BookFile (book name, id, cover URL, local cover path, date)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert_row(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into BookTitle (book name, chapter title, book id, chapter id, date)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert_row(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter row into BookConte and print the new row id."""
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        row_id = self._insert_row(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction,
            Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
            BookConte, Titleid, abover, Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint and return the response.

        Text fields are sent GBK-encoded (unencodable characters dropped);
        the cover URL is sent as-is.
        """
        # HACK: legacy Python 2 workaround kept from the original.  It changes
        # the interpreter-wide default encoding, so it is applied only on
        # Python 2 and only once instead of on every call.
        if sys.version_info[0] == 2 and sys.getdefaultencoding() != 'gbk':
            reload(sys)
            sys.setdefaultencoding("gbk")

        def to_gbk(text):
            return text.encode('gbk', 'ignore')

        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': to_gbk(Bookname),
            'caid': to_gbk(Cater_Name),
            'title_669977_net': to_gbk(Booktitle),
            'article': to_gbk(BookConte),
            'author': to_gbk(Book_author),
            'ready_1': to_gbk(Book_Palabras),
            'thumb': Book_img,
            'content': to_gbk(Book_Introduction),
            'abover': to_gbk(abover),
        }
        res = requests.post(locoy_url, data=locoy_data, timeout=self.request_timeout)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first list page to schedule
        self.total_num = 200  # last list page to schedule (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every category list page from page_num to total_num.

        Loop state is kept in locals so the handler's attributes are not
        consumed: a later scheduled run walks the same page range again.
        """
        for page_num in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self._CATER_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page_num) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every book link on a category list page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    @staticmethod
    def _parse_word_count(text):
        """Return the second space-separated field up to '|', or '' if absent."""
        fields = text.split(' ')
        if len(fields) < 2:
            return ''
        return fields[1].split('|')[0]

    def list_Caterg_detail(self, response):
        """Scrape a book page, download its cover, record it and follow the catalog link."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = self._parse_word_count(response.doc('.booktitle p').text())
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults keep the rest of the callback working when the page has no
        # cover image (the original raised NameError in that case).
        img = ''
        Locaimg = ''
        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            # Cover download: <P_dir>/<name>/<name>.<extension>
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: comment out these two lines to skip the local download.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook, for use by other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """From the chapter catalog, follow the first chapter link only."""
        Datos = dict((key, response.save[key]) for key in self._SAVE_KEYS)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it, queue the next chapter and return the record."""
        saved = response.save

        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # category
        Book_author = saved['Book_author']  # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # Book status field, built by concatenating title and saved metadata.
        abover = Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras'] + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        # Chapter text: literal substitutions, strip punctuation, then turn
        # paragraph breaks into <br>.
        BookConte = response.doc('.article-con').text()
        for old, new in self._CONTENT_REPLACEMENTS:
            BookConte = BookConte.replace(old, new)
        BookConte = self._PUNCTUATION_RE.sub(u'', BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        Book_Introduction = saved['Book_Introduction']  # introduction
        for old, new in self._INTRO_REPLACEMENTS:
            Book_Introduction = Book_Introduction.replace(old, new)
        Book_Introduction = self._PUNCTUATION_RE.sub(u'', Book_Introduction)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Cover URL.  The original had a trailing comma here, which turned
        # the value into a 1-tuple before it was posted and returned.
        Book_img = saved['img']

        # Optional: database publishing hooks, for use by other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy HTTP publishing hook; remove if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Pass the untouched metadata on to the next chapter.
        Datos = dict((key, saved[key]) for key in self._SAVE_KEYS)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    @staticmethod
    def _ensure_dir(path):
        """Create ``path`` if missing, tolerating a concurrent creator."""
        if os.path.isdir(path):
            return
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download ``Book_img`` to ``imgDir/file_name``.

        The image is fetched before the file is opened, so a failed request
        leaves no empty file behind.  ``P_dir`` is accepted for interface
        compatibility and is not used.

        Returns True on success, False (after printing the reason) on any
        network or file-system failure.
        """
        try:
            resp = requests.get(Book_img, timeout=self.request_timeout)
            resp.raise_for_status()
            self._ensure_dir(imgDir)
            with open(imgDir + "/" + file_name, 'wb') as out:
                out.write(resp.content)
        except (IOError, OSError, requests.RequestException) as err:
            print("Failed to download %s: %s" % (Book_img, err))
            return False
        return True

    def save_imgs(self, response):
        """Callback: save the response body using file_name / imgDir from response.save."""
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join adds the separator only when imgDir lacks one.
        # NOTE(review): assumes imgDir is a directory path - confirm at the caller.
        self.save_img(response.content, imgDir, os.path.join(imgDir, file_name))

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if needed."""
        self._ensure_dir(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    @staticmethod
    def _url_basename(url):
        """Return the last path segment of ``url`` without query or fragment."""
        return url.split("#", 1)[0].split("?", 1)[0].split("/")[-1]

    def getExtension(self, url):
        """Return the file extension of the URL's last path segment ('' if none)."""
        basename = self._url_basename(url)
        if "." not in basename:
            return ""
        return basename.rsplit(".", 1)[-1]

    def getname(self, url):
        """Return the URL's last path segment up to its first dot."""
        name = url.split("/")[-1].split(".")[0]
        return name
复制代码
. w) o2 p) e3 b) F+ a) c
4 t2 p, W: Y" w8 _$ [ |