Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *
-
class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and publishes its chapters.

    Callback chain:

    * ``on_start`` queues the listing pages of every category.
    * ``list_Caterg`` follows each book link found on a listing page.
    * ``list_Caterg_detail`` scrapes the book page, downloads the cover and
      records the book in MySQL.
    * ``index_page`` opens the first chapter of the catalogue.
    * ``detail_page`` cleans one chapter, stores it in MySQL, posts it to the
      publishing endpoint and follows one navigation link (presumably the
      "next chapter" button -- confirm against the site markup).
    """

    # ``global`` inside a class body binds module-level names.  The original
    # script exposed ``P_dir`` and ``Datos`` this way, so they stay module
    # globals for backward compatibility.
    global P_dir
    global Datos
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # MySQL connection settings (placeholders: fill in real credentials).
    _DB_SETTINGS = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category name sent to the publishing side.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book dict handed from callback to callback via ``save``.
    _SAVE_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # (needle, replacement) pairs applied in order with ``str.replace``.
    # The ``r'^\\n\\n'`` needle is therefore matched literally, not as a
    # regular expression.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Runs of these characters are deleted from chapter text and introduction.
    # NOTE(review): on Python 2 this is a byte-string pattern, so the
    # non-ASCII punctuation in the second class is presumably matched
    # byte-wise rather than per character -- confirm the intended behaviour.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    # ------------------------------------------------------------------
    # MySQL publishing
    # ------------------------------------------------------------------
    def _insert_row(self, sql, params):
        """Run one parameterized INSERT on a fresh connection.

        Values are passed separately from the statement so the driver escapes
        them; scraped text may contain quotes that would break (or inject
        into) a string-built statement.

        Returns ``cursor.lastrowid`` on success and ``None`` when the
        statement failed (the error is printed and the transaction rolled
        back, as the original best-effort code did).
        """
        db = pymysql.connect(**self._DB_SETTINGS)
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, params)
                row_id = cursor.lastrowid
            finally:
                cursor.close()
            db.commit()
            return row_id
        except Exception as err:
            # Deliberately broad: a failed insert must not abort the crawl.
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            # The original never closed the connection (one leak per row).
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one book/cover record into the ``BookFile`` table."""
        # Placeholders must NOT be quoted when parameters are passed separately.
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert_row(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title record into the ``BookTitle`` table."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert_row(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Insert one full chapter record into the ``BookConte`` table.

        Prints the new row id when the insert succeeds.
        """
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        row_id = self._insert_row(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
            Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid,
            abover, Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    # ------------------------------------------------------------------
    # HTTP publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the publishing endpoint and return the response.

        Text fields are sent GBK-encoded; characters that GBK cannot represent
        are dropped.  ``Book_img`` is sent unchanged.

        The original called ``reload(sys); sys.setdefaultencoding("gbk")`` on
        every invocation.  That changes a process-wide setting and is not
        needed here because every value is encoded explicitly, so it was
        removed.
        """
        def to_gbk(text):
            return text.encode('gbk', 'ignore')

        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': to_gbk(Bookname),
            'caid': to_gbk(Cater_Name),
            'title_669977_net': to_gbk(Booktitle),
            'article': to_gbk(BookConte),
            'author': to_gbk(Book_author),
            'ready_1': to_gbk(Book_Palabras),
            'thumb': Book_img,
            'content': to_gbk(Book_Introduction),
            'abover': to_gbk(abover),
        }
        # Bound the request so a stalled endpoint cannot hang the worker.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue listing pages ``page_num``..``total_num`` of every category.

        Instance attributes are left untouched.  The original advanced
        ``self.page_num`` while looping, so a later scheduled run on the same
        handler instance found the counter already past ``total_num`` and
        queued nothing.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # Unknown slugs fall back to the slug itself.
                Cater_Name = self._CATEGORY_NAMES.get(cater_id, cater_id)
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Follow every book link on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, fetch its cover and queue its catalogue page.

        A page without a matching cover image is still processed, with empty
        ``img`` / ``Locaimg`` values.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token of the title line, cut at the first
        # '|'; empty when the line has no space.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults so the code below works when no cover image matched.
        img = ''
        Locaimg = ''
        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        book_info = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=book_info)

    def _forward_book_info(self, saved):
        """Copy the per-book fields out of ``response.save`` for the next task."""
        return {key: saved[key] for key in self._SAVE_KEYS}

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter listed on a book's catalogue page."""
        book_info = self._forward_book_info(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_info)

    def _clean_text(self, text, replacements):
        """Apply the literal replacements in order, then strip punctuation runs."""
        for needle, substitute in replacements:
            text = text.replace(needle, substitute)
        return re.sub(self._STRIP_PATTERN, u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Clean, store and publish one chapter, then follow the chapter link.

        Returns a dict describing the chapter (pyspider stores it as the
        task result).
        """
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # category
        Book_author = saved['Book_author']  # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # Book status text, built by concatenating four scraped fields.
        abover = (response.doc('.article-title').text() + saved['Book_Synopsis']
                  + saved['Book_Palabras'] + saved['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())  # crawl time

        raw_content = response.doc('.article-con').text()  # chapter body
        BookConte = self._clean_text(
            raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._clean_text(
            saved['Book_Introduction'], self._INTRO_REPLACEMENTS)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # The original had a stray trailing comma here, which turned the
        # cover URL into a 1-tuple.
        Book_img = saved['img']  # cover image URL

        # MySQL publishing (optional: remove these calls if not needed).
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                          BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # HTTP publishing (optional: remove this call if not needed).
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        book_info = self._forward_book_info(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book_info)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and save it as ``imgDir/file_name``.

        ``P_dir`` is accepted for backward compatibility and is not used.

        Returns True when the file was written and False when the request or
        the write failed (the error is printed).  The image is fetched before
        the file is opened so a failed request leaves no empty file behind.
        """
        try:
            reply = requests.get(Book_img, timeout=self.crawl_config['timeout'])
            reply.raise_for_status()
            if not os.path.exists(imgDir):
                os.makedirs(imgDir)
            with open(imgDir + "/" + file_name, 'wb') as out:
                out.write(reply.content)
        except (requests.RequestException, IOError, OSError) as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        return True

    def save_imgs(self, response):
        """Write ``response.content`` to the path described by ``response.save``.

        Reads ``file_name`` and ``imgDir`` from ``response.save``.  The path
        is joined with ``os.path.join`` so it is correct whether or not
        ``imgDir`` ends with a separator.
        """
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        self.save_img(response.content, imgDir, os.path.join(imgDir, file_name))

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def _last_segment(self, url):
        """Return the last path segment of ``url`` without query or fragment."""
        return url.split("?")[0].split("#")[0].split("/")[-1]

    def getExtension(self, url):
        """Return the text after the last dot of the URL's file name.

        Returns '' when the file name has no dot.
        """
        segment = self._last_segment(url)
        if "." not in segment:
            return ""
        return segment.rsplit(".", 1)[1]

    def getname(self, url):
        """Return the URL's file name up to (not including) its first dot."""
        return self._last_segment(url).split(".")[0]
复制代码 1 G1 Q) @6 U( p; ^" S
) }. a: k! y! v9 }8 I' u" {
|