# ●10-1 获得一个HTML页面的通用代码
import requests
def getHTMLText(url):
    """Fetch *url* and return the page body decoded as UTF-8.

    Parameters:
        url -- the page address to fetch.  (The original definition took
               no parameter yet the caller below passes one and the body
               read an undefined global -- fixed here.)

    Returns the page text, or "" on any request/HTTP failure so callers
    can treat the result uniformly as text.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise an exception if status is not 200
        r.encoding = 'utf-8'  # force UTF-8 regardless of declared encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any request failure yields "".
        return ""
# Demo driver for the generic fetcher above.
url = "http://www.baidu.com"
# NOTE(review): getHTMLText is defined above WITHOUT a parameter, so this
# call raises TypeError as written -- the def should accept `url`.
print(getHTMLText(url))
# ●10-2 中国大学排名爬虫
#e23.1CrawUnivRanking.py
import requests
from bs4 import BeautifulSoup
# Module-level accumulator: one list of cell strings per university row,
# filled by fillUnivList() and consumed by printUnivList().
allUniv = []
def getHTMLText(url):
    """Fetch *url* and return the page body decoded as UTF-8.

    Parameters:
        url -- the page address to fetch.

    Returns the page text, or "" on any request/HTTP failure so the
    caller can hand the result straight to the parser.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise an exception if status is not 200
        r.encoding = 'utf-8'  # force UTF-8 regardless of declared encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any request failure yields "".
        return ""
def fillUnivList(soup, target=None):
    """Extract one list of <td> cell strings per table row into *target*.

    Parameters:
        soup   -- a parsed BeautifulSoup document (anything exposing the
                  same find_all interface works).
        target -- optional list to append rows to; defaults to the
                  module-level allUniv so existing callers are unchanged.

    Rows containing no <td> cells (e.g. <th>-only header rows) are
    skipped.  Cell values are taken via td.string, which may be None for
    cells with nested markup -- same as the original behaviour.
    """
    rows = allUniv if target is None else target
    for tr in soup.find_all('tr'):
        cells = tr.find_all('td')
        if len(cells) == 0:
            continue  # header/separator row: no data cells
        rows.append([td.string for td in cells])
def printUnivList(num, data=None):
    """Print a header line plus the first *num* university rows.

    Parameters:
        num  -- how many rows to print (extra rows are ignored; fewer
                rows than num no longer raises IndexError).
        data -- optional sequence of rows; defaults to the module-level
                allUniv filled by fillUnivList, so existing callers are
                unchanged.

    Each row must have at least 7 cells; columns 0-3 and 6 (rank, name,
    province, score, enrollment) are shown.
    NOTE(review): '{:^10}' counts characters, not display width, so CJK
    text will not align perfectly in a terminal.
    """
    rows = allUniv if data is None else data
    print("{:^4}{:^10}{:^5}{:^8}{:^10}".format("排名","学校名称","省市","总分","培养规模"))
    for u in rows[:num]:
        print("{:^4}{:^10}{:^5}{:^8}{:^10}".format(u[0], u[1], u[2], u[3], u[6]))
def main():
    """Crawl the 2016 ranking page, parse it, and show the top ten rows."""
    ranking_url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    page = getHTMLText(ranking_url)
    document = BeautifulSoup(page, "html.parser")
    fillUnivList(document)
    printUnivList(10)
if __name__ == "__main__":
    # Guard the entry point so importing this file does not trigger a
    # network crawl; running it as a script behaves exactly as before.
    main()
# ●10-3 百度关键词自动提交
#e24.1AutoKeywordSearch.py
import requests
from bs4 import BeautifulSoup
import re
import json
def getKeywordResult(keyword):
    """Submit *keyword* to Baidu search and return the result page text.

    Parameters:
        keyword -- the search phrase; it is concatenated into the query
                   string.  NOTE(review): requests percent-encodes the
                   assembled URL, but passing params={'wd': keyword}
                   would make the encoding explicit -- confirm before
                   changing, as it alters the exact URL sent.

    Returns the page text decoded as UTF-8, or "" on any request/HTTP
    failure.
    """
    url = 'http://www.baidu.com/s?wd=' + keyword
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise an exception for non-200 responses
        r.encoding = 'utf-8'  # force UTF-8 decoding of the result page
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any request failure yields "".
        return ""
def parserLinks(html):
    """Collect the result titles from a Baidu search result page.

    Each result <div> carries a JSON 'data-tools' attribute whose
    'title' field is the link caption; return those captions in page
    order as a list of strings.
    """
    soup = BeautifulSoup(html, "html.parser")
    result_divs = soup.find_all('div', {'data-tools': re.compile('title')})
    # Decode each attribute's JSON payload and keep only its title field.
    return [json.loads(div.attrs['data-tools'])['title'] for div in result_divs]
def main():
    """Search Baidu for the textbook title and print each result caption."""
    page = getKeywordResult('Python语言程序设计基础(第2版)')
    for count, title in enumerate(parserLinks(page), start=1):
        print("[{:^3}]{}".format(count, title))
if __name__ == "__main__":
    # Guard the entry point so importing this file does not trigger a
    # network search; running it as a script behaves exactly as before.
    main()