#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo
#
# Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
# (A Python + pyspider crawler for a novel site: it stores results in a
# database, publishes through a LocoySpider ("火车头") endpoint and downloads
# resources locally; other crawlers can be written the same way.)

# pyspider's wildcard import (BaseHandler, every, config, ...).  It is kept
# first so that the explicit imports below win on any name clash.
from pyspider.libs.base_handler import *

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests

class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes the chapters.

    Crawl flow (every step is a pyspider callback):

    1. ``on_start`` schedules the listing pages of every category.
    2. ``list_Caterg`` follows each book link found on a listing page.
    3. ``list_Caterg_detail`` reads the book metadata, downloads the cover,
       records the book in the ``BookFile`` table and follows the catalogue
       link.
    4. ``index_page`` follows the link in the first item of the chapter list.
    5. ``detail_page`` cleans one chapter, stores it in MySQL, posts it to
       the Locoy endpoint and follows the 4th ``.articlebtn`` link
       (presumably "next chapter" - confirm against the site markup).
    """

    # NOTE: ``global`` inside a class body binds these two names at module
    # level instead of creating class attributes.  The original script
    # published them that way, so this is preserved; ``list_Caterg_detail``
    # reads ``P_dir`` as a plain global.
    global P_dir, Datos
    P_dir = '/Tools/Debug/'  # local root directory for downloaded cover images
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Keyword arguments for ``pymysql.connect``.  The user, password and db
    # values are placeholders that must be replaced with real settings.
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Source-site category slug -> category name sent to the publishing side.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the book metadata dict handed from callback to callback
    # through ``save``.
    _BOOK_META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied, in order, to chapter text.
    # NOTE(review): the last pair looks like a regex but is used with
    # ``str.replace`` and therefore only matches that literal character
    # sequence; kept as in the original because the intent is unclear.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )

    # Literal (old, new) substitutions applied, in order, to the introduction.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Runs of these characters are removed from chapter and introduction
    # text.  The pattern is a *unicode* string: a byte-string pattern holding
    # UTF-8 characters does not match CJK punctuation in unicode text under
    # Python 2.  The character set is the union of the two original classes.
    # NOTE(review): the listing this script came from may have normalised
    # full-width punctuation to ASCII - confirm the intended character set.
    _PUNCTUATION_RE = re.compile(
        u'[\\f\\t\\v+.{}()!/_,$%^*"\'—?【】“”。、~@#¥…&]+')

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _insert(self, sql, params):
        """Execute one parameterized INSERT on a fresh connection.

        The statement is committed on success.  On any error during
        execution the error is printed and the transaction rolled back
        (best effort, as in the original script).  Connection errors from
        ``pymysql.connect`` propagate.  The connection is always closed.

        Returns the cursor's ``lastrowid`` on success, ``None`` on failure.
        """
        with closing(pymysql.connect(**self.db_config)) as db:
            try:
                cursor = db.cursor()
                # The driver escapes ``params``; scraped text is never
                # interpolated into the SQL string itself.
                cursor.execute(sql, params)
                row_id = cursor.lastrowid
                db.commit()
                return row_id
            except Exception as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None

    def _book_meta(self, response):
        """Return a fresh copy of the book metadata carried in ``response.save``.

        Raises ``KeyError`` when one of the expected keys is missing.
        """
        saved = response.save
        return {key: saved[key] for key in self._BOOK_META_KEYS}

    @staticmethod
    def _apply_replacements(text, pairs):
        """Apply each literal ``(old, new)`` replacement to ``text`` in order."""
        for old, new in pairs:
            text = text.replace(old, new)
        return text

    @staticmethod
    def _parse_word_count(text):
        """Extract the word-count field from the ``.booktitle p`` text.

        The value is the second space-separated token, cut at the first
        ``|``.  Returns ``''`` when the text has no second token instead of
        raising ``IndexError``.
        """
        tokens = text.split(' ')
        if len(tokens) < 2:
            return ''
        return tokens[1].split('|')[0]

    @staticmethod
    def _url_basename(url):
        """Return the last path segment of ``url`` without query or fragment."""
        return url.split("?")[0].split("#")[0].split("/")[-1]

    # ------------------------------------------------------------------
    # Database sinks
    # ------------------------------------------------------------------
    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one book row (name, id, cover URL, local cover path, date)
        into the ``BookFile`` table.  Errors are printed, not raised."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title row into the ``BookTitle`` table.
        Errors are printed, not raised."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one full chapter row into the ``BookConte`` table and print
        the new row id.  Errors are printed, not raised."""
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        row_id = self._insert(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction,
            Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
            BookConte, Titleid, abover, Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    # ------------------------------------------------------------------
    # Locoy (火车头) publishing sink
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are sent GBK-encoded (unencodable characters dropped);
        ``Book_img`` is sent as given.  The response text and raw content
        are printed.  Returns the ``requests`` response object; network
        errors from ``requests.post`` propagate.
        """
        # NOTE(review): Python 2 only.  Changing the interpreter-wide default
        # encoding is a known-fragile hack; it is kept because other code
        # may rely on it, but is now applied once instead of on every call.
        if sys.getdefaultencoding() != "gbk":
            reload(sys)
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publish endpoint
        locoy_data = {
            'my_u': '用户名',  # backend user name
            'my_p': '密码',  # backend password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every listing page of every category.

        Pages ``self.page_num`` .. ``self.total_num`` are crawled for each
        slug in ``self.CaterIds``.  Loop state is kept in local variables so
        that a later scheduled run starts from the first page again instead
        of finding ``self.page_num`` already past ``self.total_num``.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # Unknown slugs fall back to the slug itself.
                cater_name = self.CATEGORY_NAMES.get(cater_id, cater_id)
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every absolute book link in ``.pic-list``, passing the
        category name along in ``save``."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Read one book page: collect metadata, download the cover, record
        the book in the database and follow the catalogue link."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = self._parse_word_count(response.doc('.booktitle p').text())
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults cover the case where the page has no matching cover image.
        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Cover download: stored under <P_dir><name>/<name>.<extension>
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension if extension else name
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional step: saves the cover image to local disk.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        book_meta = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional step: database publishing interface for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=book_meta)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Follow the absolute link in the first ``.chapter-list`` item,
        forwarding the book metadata."""
        book_meta = self._book_meta(response)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_meta)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Process one chapter page.

        Cleans the chapter text and introduction, stores the chapter in
        MySQL, posts it to the Locoy endpoint, schedules the following
        chapter and returns the collected fields as the task result.

        NOTE(review): because of ``@catch_status_code_error`` this callback
        also receives responses with an error status; they are processed
        like any other page, as in the original.
        """
        book_meta = self._book_meta(response)

        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = book_meta['Cater_Name']  # category
        Book_author = book_meta['Book_author']  # author
        raw_introduction = book_meta['Book_Introduction']  # introduction
        Book_Synopsis = book_meta['Book_Synopsis']  # latest update
        Book_Palabras = book_meta['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        raw_content = response.doc('.article-con').text()  # chapter text
        # Concatenation of chapter title, latest update, word count and the
        # raw introduction (the original comment calls this the book status).
        abover = Booktitle + Book_Synopsis + Book_Palabras + raw_introduction
        Book_Date = str(datetime.datetime.now())  # crawl time

        # Chapter text: literal substitutions, punctuation removal, then
        # blank lines become HTML line breaks.
        BookConte = self._apply_replacements(raw_content, self._CONTENT_REPLACEMENTS)
        BookConte = self._PUNCTUATION_RE.sub(u'', BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        Book_Introduction = self._apply_replacements(raw_introduction, self._INTRO_REPLACEMENTS)
        Book_Introduction = self._PUNCTUATION_RE.sub(u'', Book_Introduction)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string: the original's trailing comma made this a 1-tuple.
        Book_img = book_meta['img']  # cover image URL

        # Optional step: store the chapter in MySQL for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional step: publish through the Locoy endpoint.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_meta)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download ``Book_img`` to ``imgDir/file_name``.

        ``P_dir`` is accepted for interface compatibility and is not used.
        The image is fetched before the file is opened, so a failed request
        leaves no empty file behind.

        Returns ``True`` when the file was written, ``False`` when the
        request failed or returned an error status.  File-system errors
        propagate.
        """
        try:
            reply = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        except requests.RequestException as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        if not reply.ok:
            print("Error %s for download: %s" % (reply.status_code, Book_img))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as target:
            target.write(reply.content)
        return True

    # Callback variant: save an image fetched by pyspider itself.
    def save_imgs(self, response):
        """Write ``response.content`` to the directory and file name given
        in ``response.save`` (keys ``imgDir`` and ``file_name``)."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join inserts the separator the plain concatenation lacked
        # and still works when imgDir already ends with one.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    # Save image bytes to disk.
    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` when missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as target:
            target.write(content)

    # Get the file extension of a URL.
    def getExtension(self, url):
        """Return the extension of the URL's last path segment, without the
        dot; ``''`` when the segment has none.  Query string and fragment
        are ignored."""
        basename = self._url_basename(url)
        if "." not in basename:
            return ''
        return basename.split(".")[-1]

    # Get the image name of a URL.
    def getname(self, url):
        """Return the URL's last path segment up to its first dot.  Query
        string and fragment are ignored."""
        return self._url_basename(url).split(".")[0]
# End of listing.  (The trailing "复制代码" / "copy code" text was forum page
# residue, not part of the script.)