Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *
-
class Handler(BaseHandler):
    """pyspider handler that crawls a novel site chapter by chapter.

    Flow of callbacks:
    ``on_start`` -> ``list_Caterg`` -> ``list_Caterg_detail`` ->
    ``index_page`` -> ``detail_page`` (which re-queues itself for the
    "next chapter" link).

    Every chapter is written to MySQL (``add_question`` / ``add_comment``)
    and POSTed to a Locoy publishing endpoint (``add_locoy``); cover images
    are downloaded to ``P_dir`` and recorded through ``add_BookFile``.
    """

    # The class-level ``global`` statements make the two assignments below
    # bind module-level names rather than class attributes;
    # ``list_Caterg_detail`` reads ``P_dir`` as a module global.
    global Datos
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    def _insert(self, sql, params, debug=False):
        """Run one parameterized INSERT on a fresh connection.

        :param sql: statement using ``%s`` placeholders (no quoting; the
            driver escapes the values).
        :param params: sequence of values bound to the placeholders.
        :param debug: when true, print the statement and the new row id.

        Errors raised while executing are printed and rolled back, not
        re-raised, so one bad row does not abort the crawl callback.
        A failure to connect still propagates to the caller.
        """
        db = pymysql.connect(host="localhost", user="数据库用户名", password="密码", db="数据库名", charset="utf8")
        try:
            cursor = db.cursor()
            if debug:
                print(sql)
            # Values are passed separately so scraped text containing quotes
            # cannot break (or inject into) the statement.
            cursor.execute(sql, params)
            if debug:
                print(cursor.lastrowid)
            db.commit()
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
        finally:
            # Always release the connection; one is opened per insert.
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book, remote cover URL, local cover path)."""
        sql = 'insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) values (%s,%s,%s,%s,%s)'
        self._insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (chapter title index for a book)."""
        sql = 'insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) values (%s,%s,%s,%s,%s)'
        self._insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter (metadata plus content) into ``BookConte``.

        Prints the statement and the resulting ``lastrowid`` for debugging.
        """
        sql = (
            ' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
            'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date)'
            ' VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        )
        params = (
            Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
            Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid,
            abover, Book_Date,
        )
        self._insert(sql, params, debug=True)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are GBK-encoded (unencodable characters dropped) before
        posting. Returns the ``requests`` response object.
        """
        # NOTE(review): Python 2 only. This switches the process-wide default
        # encoding to GBK on every call; kept because other implicit
        # str/unicode conversions (e.g. the prints below) may rely on it.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publish endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        """Set the site URL parts, category slugs and the page range to crawl."""
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every listing page of every category.

        The category's display name travels with the task via ``save``.
        """
        # Site category slug -> category name used when publishing.
        category_names = {
            'xuanhuan': '玄幻',
            'wuxia': '武侠',
            'lishi': '历史',
            'yanqing': '都市',
            'nvsheng': '都市',
            'kehuan': '科幻',
            'kongbu': '游戏',
        }
        Cater_Name = []
        # Loop with locals instead of mutating self.page_num / self.CaterId:
        # mutating the instance would leave the counter past total_num, so a
        # later scheduled run on the same instance would queue nothing.
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # An unmapped slug keeps the previous name.
                Cater_Name = category_names.get(cater_id, Cater_Name)
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Listing page: queue each book linked from ``.pic-list``."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Book page: scrape metadata, fetch the cover, queue the chapter index."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token, up to the first '|'.
        # Raises IndexError if the paragraph has no space.
        Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id taken from the URL
        Book_Dates = str(datetime.datetime.now())
        # NOTE(review): everything that uses ``img`` / ``Datos`` is kept inside
        # this loop because those names are only bound here; a page without a
        # matching cover image therefore queues nothing.
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover image.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: these two lines save the image locally.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            Datos = {
                "Cater_Name": Cater_Name,
                "Book_author": Book_author,
                "Book_Introduction": Book_Introduction,
                "Book_Synopsis": Book_Synopsis,
                "Book_Palabras": Book_Palabras,
                "img": img,
            }
            # Optional: database publishing hook for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
            for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
                self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Chapter index: queue the first chapter, forwarding the book metadata."""
        forwarded_keys = (
            "Cater_Name", "Book_author", "Book_Introduction",
            "Book_Synopsis", "Book_Palabras", "img",
        )
        Datos = dict((key, response.save[key]) for key in forwarded_keys)
        # Only the first chapter is queued; detail_page follows "next" links.
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Chapter page: clean the text, store and publish it, queue the next chapter.

        Returns a dict with the chapter and book fields as the task result.
        """
        # Literal (non-regex) substitutions applied with str.replace.
        NewRe1 = u'哈书'
        NewRe2 = u'huhjsd.CC'
        NewRe3 = r'^\\n\\n'
        NewRe5 = u'小说网'
        NewRe6 = u'fgdfgf'
        NewRe7 = u'fgfgf'
        # Regex of characters stripped with re.sub.
        # NOTE(review): this is a byte-string pattern under Python 2 while the
        # scraped text is presumably unicode; confirm the non-ASCII half matches.
        NewRe4 = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
        ReC1 = u'静思'
        ReC2 = u'aghgf.com'
        ReC3 = u'aghgfh.com'
        ReC4 = u''
        ReC5 = u'文学网'
        ReC6 = r'<BR>'

        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = response.save['Cater_Name']  # category
        Book_author = response.save['Book_author']  # author
        Book_Introduction = response.save['Book_Introduction']  # introduction
        Book_Synopsis = response.save['Book_Synopsis']  # latest update
        Book_Palabras = response.save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        BookConte = response.doc('.article-con').text()  # chapter text
        # The original comment calls this the book status (serialising or
        # finished); the value is the concatenation of the fields below.
        abover = (
            response.doc('.article-title').text()
            + response.save['Book_Synopsis']
            + response.save['Book_Palabras']
            + response.save['Book_Introduction']
        )
        Book_Date = str(datetime.datetime.now())  # crawl time

        # Chapter text: literal replacements, strip punctuation, then turn
        # blank lines into <br>.
        content_replacements = (
            (NewRe1, ReC1),
            (NewRe2, ReC2),
            (NewRe5, ReC5),
            (NewRe6, ReC2),
            (NewRe7, ReC2),
            (NewRe3, ReC6),
        )
        for old, new in content_replacements:
            BookConte = BookConte.replace(old, new)
        BookConte = re.sub(NewRe4, ReC4, BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        # Introduction: a shorter replacement chain, then the same stripping.
        intro_replacements = (
            (NewRe1, ReC1),
            (NewRe2, ReC2),
            (NewRe3, ReC3),
        )
        for old, new in intro_replacements:
            Book_Introduction = Book_Introduction.replace(old, new)
        Book_Introduction = re.sub(NewRe4, ReC4, Book_Introduction)

        # Chapter id: the URL path segment that follows "<BookID>/".
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Cover URL. The original had a trailing comma here, which made this
        # a 1-tuple instead of a string.
        Book_img = response.save['img']

        # Store in MySQL (optional database publishing hooks).
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Publish by POST to Locoy (optional; remove if not needed).
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Forward the unmodified book metadata to the next chapter.
        forwarded_keys = (
            "Cater_Name", "Book_author", "Book_Introduction",
            "Book_Synopsis", "Book_Palabras", "img",
        )
        Datos = dict((key, response.save[key]) for key in forwarded_keys)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and save it as ``imgDir/file_name``.

        :param P_dir: unused; kept for call compatibility.
        :param imgDir: target directory, created if missing.
        :param file_name: file name inside ``imgDir``.
        :param Book_img: URL of the image to download.
        :returns: ``True`` when the image was saved, ``False`` when the
            server answered with an error status.
        """
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        # Fetch before opening the file so a failed request does not leave an
        # empty file behind.
        imag = requests.get(Book_img)
        if not imag.ok:
            return False
        with open(imgDir + "/" + file_name, 'wb+') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback that stores an image response using ``file_name`` / ``imgDir`` from ``save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # NOTE(review): joined without a separator, so this presumably expects
        # imgDir to end with "/" -- confirm against the caller.
        file_path = imgDir + file_name
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the text after the last ``.`` in ``url``."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first ``.``."""
        return url.split("/")[-1].split(".")[0]
复制代码 9 S# H1 X. l2 u3 {$ n
. i! R8 [4 `9 K, Y
|