Python crawler: scraping the 企鹅智酷 series from the Tencent News tech channel (1)

Without further ado, here is the code. It is written mainly with BeautifulSoup: gethref() reads the listing page and records each article's number, title, summary and link in AllTitle.txt, and gettext() then saves each article's body text to its own .txt file named after the numbered title.

# -*- coding: utf-8 -*-
"""
Created on Mon May 18 19:12:06 2015

@author: Administrator
"""
import urllib
import sys

from bs4 import BeautifulSoup

# This script targets Python 2; setdefaultencoding avoids UnicodeEncodeError
# when writing the Chinese text to files.
reload(sys)
sys.setdefaultencoding("utf-8")

i = 0        # running index of articles on the listing page
j = 0        # index of the article currently being saved
list_a = []  # numbered article titles, used as output file names

def gettext(href):
    # Fetch one article page and append its paragraphs to "<numbered title>.txt".
    global j, list_a
    page = urllib.urlopen(href).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    div = soup.find_all("div", class_="content")
    p_text = div[0].find_all("p")
    fp = open("%s.txt" % list_a[j], "a")
    for p in p_text:
        fp.write(" ")
        fp.write(p.get_text())
        fp.write(" \n")
    fp.close()
    j += 1

def gethref(url):
    # Fetch the listing page and collect every article link.
    global i, list_a
    fp = open("AllTitle.txt", "w+")
    page = urllib.urlopen(url).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    ul = soup.find_all("ul", class_="row1")
    li = ul[0].find_all("li")
    for lia in li:
        list_a.append(("%s、" % (i + 1)) + lia.h3.get_text())
        href = lia.a.get("href")
        # Write the title, summary and link to the index file in a fixed format.
        fp.write("%s、" % (i + 1))
        i += 1
        fp.write("标题:")
        fp.write(lia.h3.get_text())
        fp.write("\n 简介:")
        fp.write(lia.p.get_text())
        fp.write("\n 链接:")
        fp.write(lia.a.get("href"))
        fp.write("\n")
        gettext(href)
    fp.close()

if __name__ == "__main__":
    url = ""  # listing page URL (left blank in the original post)
    gethref(url)
    print "All Is OK!"

