python 爬虫爬取腾讯新闻科技类的企鹅智酷系列(1)

发布时间:2017-09-13 11:47:05
python 爬虫爬取腾讯新闻科技类的企鹅智酷系列(1)

废话不多说,直接贴代码(Python 2),主要采用 BeautifulSoup 编写:

# -*- coding: utf-8 -*-

"""

Created on Mon May 18 19:12:06 2015

@author: Administrator

"""

import urllib

import os

from bs4 import BeautifulSoup

import sys

# Python 2 idiom: reload the sys module so that setdefaultencoding() can be
# called, then make utf-8 the default codec for implicit str/unicode
# conversions.
# NOTE(review): presumably needed so the unicode text returned by
# BeautifulSoup can be passed to fp.write() without an explicit encode --
# confirm before removing.
reload(sys)

sys.setdefaultencoding("utf-8")

# Count of index entries handled so far by gethref(); used to number them.
i = 0

# Index into list_a of the title that the next gettext() call will use.
j = 0

# Numbered article titles collected by gethref(); gettext() uses each one as
# the base name of an output ".txt" file.
list_a = []

def gettext(href):
    """Download one article page and append its paragraphs to a text file.

    The page is parsed as gb18030 and every ``<p>`` inside the first
    ``<div class="content">`` is appended to ``"<title>.txt"``, where the
    title is ``list_a[j]``.  The module-level counter ``j`` is advanced once
    per call so that the next call uses the next collected title.

    A page without a ``content`` div (or without paragraphs) produces no
    output file, but ``j`` is still advanced so titles stay aligned with the
    articles.

    Args:
        href: URL of the article page to fetch.
    """
    global j  # list_a is only read here, so it needs no global declaration

    # Close the HTTP response explicitly instead of leaving it to the GC.
    response = urllib.urlopen(href)
    try:
        page = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(page, from_encoding="gb18030")
    div = soup.find_all("div", class_="content")

    # An unexpected page layout should skip this article, not abort the crawl.
    p_text = div[0].find_all("p") if div else []

    if p_text:
        # Open the output file once per article and guarantee it is closed;
        # the original reopened it for every paragraph and never closed it.
        with open("%s.txt" % list_a[j], "a") as fp:
            for p in p_text:
                fp.write(' ')
                fp.write(p.get_text())
                fp.write(" \n")

    j += 1

def gethref(url):
    """Collect every article link on the index page and crawl each article.

    For each ``<li>`` in the first ``<ul class="row1">`` of the page, the
    numbered title is appended to the module-level ``list_a``, the title,
    summary and link are written to ``AllTitle.txt``, and ``gettext`` is
    called on the link to save the article body.  The module-level counter
    ``i`` is advanced once per entry.

    Args:
        url: URL of the index page listing the articles.

    Raises:
        IndexError: if the page contains no ``<ul class="row1">`` element.
    """
    global i  # list_a is mutated in place, so it needs no global declaration

    # Fetch and parse before touching the output file, so that a failed
    # download does not truncate an AllTitle.txt from an earlier run.
    response = urllib.urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(page, from_encoding="gb18030")
    ul = soup.find_all("ul", class_="row1")
    li = ul[0].find_all("li")

    # The with-block guarantees the summary file is flushed and closed even
    # if one of the article downloads raises.
    with open("AllTitle.txt", "w+") as fp:
        for lia in li:
            title = lia.h3.get_text()
            href = lia.a.get('href')
            list_a.append(("%s、" % (i + 1)) + title)

            # Write the title, summary and link to the file in a fixed layout.
            fp.write("%s、" % (i + 1))
            i += 1
            fp.write("标题:")
            fp.write(title)
            fp.write("\n 简介:")
            fp.write(lia.p.get_text())
            fp.write("\n 链接:")
            fp.write(href)
            fp.write("\n")

            gettext(href)

if __name__ == "__main__":
    # Script entry point: crawl the index page, then report completion.
    url = "http://re.qq.com/biznext/zkht.htm"
    gethref(url)
    print("All Is OK!")


企业建站2800元起,携手武汉肥猫科技,做一个有见地的颜值派!更多优惠请戳:武汉做网站 https://www.feimao666.com