start
逻辑一样,采用正则的 findall 方法提取
luck = re.findall('<p class="txt">(.*?)</p><ul',
response.text,
re.S)[0]
前面我们已经看到,每个星座页面里符合条件的标签有两条,其中有一条是多余的,用正则就可以直接把它剔除:小编在正则的前后分别加上了上一个标签和下一个标签的片段来定位。re.S 让 . 也能匹配换行符,也就是允许正则表达式跨行匹配;前面看到返回的 html 页面内容是叠在一块的,没有格式化,所以需要它。
end
import requests
import time
import re
from fake_useragent import UserAgent
def get_html(url, max_retries=3, timeout=10):
    '''
    Request an HTML page, retrying when an attempt fails.

    Every attempt sends a freshly chosen random User-Agent header. An attempt
    counts as failed when the status code is not 200 or when ``requests``
    raises (connection error, timeout, ...).

    :param url: address of the page to request
    :param max_retries: number of attempts before giving up
    :param timeout: seconds to wait for the server on each attempt
    :return: the response (encoding set to utf-8) on success, otherwise None
    '''
    user_agent = UserAgent()
    for _attempt in range(max_retries):
        headers = {
            'User-agent': user_agent.random
        }
        try:
            # Without a timeout a stalled server would block the script forever.
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # A network-level error is treated like any other failed attempt.
            continue
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response
    # Every attempt failed: the caller is expected to skip this url.
    return None
def get_infos(response):
    '''
    Extract the first text block matched by the page pattern.

    The pattern captures the content of a ``<p class="txt">`` element that is
    directly preceded by ``</p>`` and directly followed by ``<ul``. ``re.S``
    lets the captured text span several lines.

    :param response: object whose ``text`` attribute holds the page HTML
    :return: the captured text, or an empty string when the pattern is absent
    '''
    match = re.search(r'</p><p class="txt">(.*?)</p><ul',
                      response.text,
                      re.S)
    if match is None:
        # Indexing the result of findall() would raise IndexError here.
        return ''
    return match.group(1)
def write_txt(_, info):
    '''
    Append one entry to luck.txt.

    An entry is the name on its own line, then the text with surrounding
    whitespace removed, then a blank line.

    :param _: constellation name
    :param info: fortune text for that constellation
    :return: None
    '''
    with open('luck.txt', 'a+', encoding='utf-8') as out:
        cleaned = info.strip()
        out.writelines((_ + '\n', cleaned + '\n\n'))
if __name__ == '__main__':
    # Names of the 12 constellations; each one is a path segment of its page url.
    constellation_names = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo',
                           'Virgo', 'Libra', 'Scorpio', 'Sagittarius',
                           'Capricorn', 'Aquarius', 'Pisces']
    for name in constellation_names:
        url = 'https://www.d1xz.net/astro/{}/'.format(name)
        response = get_html(url)
        if response is None:
            # No usable response for this page: move on to the next one.
            continue
        info = get_infos(response)
        write_txt(name, info)
        # Pause after each saved page before requesting the next one.
        time.sleep(1)