start

逻辑一样，采用正则的 findall 方法提取

luck = re.findall('<p class="txt">(.*?)</p><ul',
                      response.text,
                      re.S)[0]

前面我们已经看到，每个星座页面中符合条件的标签有两条，其中一条是多余的，所以可以直接用正则把它排除掉：小编在正则的前面加上了上一个标签的结尾，在后面加上了下一个标签的开头。re.S 的作用是让正则中的 “.” 也能匹配换行符，从而允许跨行匹配；前面看到我们的 html 页面内容是叠在一块的，没有格式化，所以需要它。

end

import requests
import time
import re
from fake_useragent import UserAgent


def get_html(url, max_retries=3, timeout=10):
    '''
    Request a page with a random User-Agent header, retrying on failure.

    :param url: address of the page to request
    :param max_retries: maximum number of attempts before giving up
    :param timeout: seconds to wait for the server on each attempt
    :return: the response (encoding set to utf-8) when the status code is 200,
             otherwise None once every attempt has failed
    '''
    user_agent = UserAgent()  # built once; a new random string is drawn per attempt
    for _attempt in range(max_retries):
        headers = {
            'User-agent': user_agent.random
        }
        try:
            # Without a timeout a stalled server would block the script forever.
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Connection errors and timeouts count as a failed attempt
            # instead of aborting the whole run.
            continue
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response
    return None


def get_infos(response):
    '''
    Extract the fortune text from a constellation page.

    :param response: object whose ``text`` attribute holds the page HTML
    :return: the content of the first ``<p class="txt">`` paragraph that is
             directly preceded by ``</p>`` and directly followed by
             ``</p><ul``; an empty string when the page has no such paragraph
    '''
    # re.S lets "." match newlines, so the paragraph may span several lines.
    match = re.search('</p><p class="txt">(.*?)</p><ul',
                      response.text,
                      re.S)
    if match is None:
        # An unexpected page layout yields an empty result instead of an
        # IndexError that would stop the remaining pages from being processed.
        return ''
    return match.group(1)


def write_txt(_, info):
    '''
    Append one constellation entry to luck.txt.

    :param _: constellation name, written on its own line
    :param info: fortune text; leading and trailing whitespace is stripped
    :return: None
    '''
    with open('luck.txt', 'a+', encoding='utf-8') as out:
        body = info.strip()
        # Name line first, then the text followed by a blank separator line.
        out.writelines([_ + '\n', body + '\n\n'])


if __name__ == '__main__':
    # Names of the 12 constellations; each one is substituted into the page URL.
    constellation_name = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo',
                          'Virgo', 'Libra', 'Scorpio', 'Sagittarius',
                          'Capricorn', 'Aquarius', 'Pisces']
    for name in constellation_name:
        url = 'https://www.d1xz.net/astro/{}/'.format(name)
        response = get_html(url)
        if response is None:
            # get_html gave up on this page; move on to the next constellation.
            continue
        info = get_infos(response)
        write_txt(name, info)
        # Wait one second before requesting the next page.
        time.sleep(1)

Python3Turtle

木下瞳的爬虫专栏

start

end