CSS选择器

一、CSS选择器

二、CSS选择器实例

按照class属性值取出网页信息

from scrapy import Selector

html="""
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>bobby基本信息</title>
    <script src="jquery-3.5.1.min.js"></script>
</head>
<body>
    <div id="info">
        <p style="color: blue">讲师信息</p>
        <div class="teacher_info">
            Python全栈工程师
            <p class="age">年龄:29</p>
            <p class="name bobbyname" data-bind="bobby">姓名:bobby</p>
            <p class="work_years">工作年限:7年</p>
            <p class="position">职位:python开发工程师</p>
        </div>
        <p style="color:aquamarine">课程信息</p>
        <table class="courses">
            <tbody><tr><th>课程名称</th>
            <th>讲师</th>
            <th>地址</th>
        </tr><tr>
                <td>django打造在线教育</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/78.html">访问</a></td>
            </tr><tr>
                <td>python高级编程</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/200.html">访问</a></td>
            </tr><tr>
                <td>scrapy分布式爬虫</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/92.html">访问</a></td>
            </tr><tr>
                <td>diango rest framework打造生鲜电商</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/131.html">访问</a></td>
            </tr><tr>
                <td>tornado从入门到精通</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/290.html">访问</a></td>
            </tr></tbody></table>

</div>
</body>
</html>
"""
# Parse the HTML text first, then serialise every node matched by the
# class selector ".teacher_info".
page = Selector(text=html)
teacher_nodes = page.css('.teacher_info')
teacher_info = teacher_nodes.getall()
print(teacher_info)

 输出结果:输出class为teacher_info的所有html元素

 按照id值取出html网页元素

from scrapy import Selector

html="""
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>bobby基本信息</title>
    <script src="jquery-3.5.1.min.js"></script>
</head>
<body>
    <div id="info">
        <p style="color: blue">讲师信息</p>
        <div class="teacher_info">
            Python全栈工程师
            <p class="age">年龄:29</p>
            <p class="name bobbyname" data-bind="bobby">姓名:bobby</p>
            <p class="work_years">工作年限:7年</p>
            <p class="position">职位:python开发工程师</p>
        </div>
        <p style="color:aquamarine">课程信息</p>
        <table class="courses">
            <tbody><tr><th>课程名称</th>
            <th>讲师</th>
            <th>地址</th>
        </tr><tr>
                <td>django打造在线教育</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/78.html">访问</a></td>
            </tr><tr>
                <td>python高级编程</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/200.html">访问</a></td>
            </tr><tr>
                <td>scrapy分布式爬虫</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/92.html">访问</a></td>
            </tr><tr>
                <td>diango rest framework打造生鲜电商</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/131.html">访问</a></td>
            </tr><tr>
                <td>tornado从入门到精通</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/290.html">访问</a></td>
            </tr></tbody></table>

</div>
</body>
</html>
"""
# Parse the HTML text first, then look up the element by id ("#info")
# and serialise the match(es) to a list of HTML strings.
page = Selector(text=html)
info_nodes = page.css('#info')
info_tag = info_nodes.getall()
print(info_tag)

 输出结果:

 选取对应class属性值下的对应元素

from scrapy import Selector

html="""
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>bobby基本信息</title>
    <script src="jquery-3.5.1.min.js"></script>
</head>
<body>
    <div id="info">
        <p style="color: blue">讲师信息</p>
        <div class="teacher_info">
            Python全栈工程师
            <p class="age">年龄:29</p>
            <p class="name bobbyname" data-bind="bobby">姓名:bobby</p>
            <p class="work_years">工作年限:7年</p>
            <p class="position">职位:python开发工程师</p>
        </div>
        <p style="color:aquamarine">课程信息</p>
        <table class="courses">
            <tbody><tr><th>课程名称</th>
            <th>讲师</th>
            <th>地址</th>
        </tr><tr>
                <td>django打造在线教育</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/78.html">访问</a></td>
            </tr><tr>
                <td>python高级编程</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/200.html">访问</a></td>
            </tr><tr>
                <td>scrapy分布式爬虫</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/92.html">访问</a></td>
            </tr><tr>
                <td>diango rest framework打造生鲜电商</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/131.html">访问</a></td>
            </tr><tr>
                <td>tornado从入门到精通</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/290.html">访问</a></td>
            </tr></tbody></table>

</div>
</body>
</html>
"""
# Parse the HTML text first, then collect the <p> elements that are direct
# children (">") of the element with class "teacher_info".
page = Selector(text=html)
child_paragraphs = page.css(".teacher_info > p").getall()
# Keep only the first match.
age_tag = child_paragraphs[0]
print(age_tag)

 输出结果: 

 选取输出对应class属性值下的对应元素的值

from scrapy import Selector

html="""
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>bobby基本信息</title>
    <script src="jquery-3.5.1.min.js"></script>
</head>
<body>
    <div id="info">
        <p style="color: blue">讲师信息</p>
        <div class="teacher_info">
            Python全栈工程师
            <p class="age">年龄:29</p>
            <p class="name bobbyname" data-bind="bobby">姓名:bobby</p>
            <p class="work_years">工作年限:7年</p>
            <p class="position">职位:python开发工程师</p>
        </div>
        <p style="color:aquamarine">课程信息</p>
        <table class="courses">
            <tbody><tr><th>课程名称</th>
            <th>讲师</th>
            <th>地址</th>
        </tr><tr>
                <td>django打造在线教育</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/78.html">访问</a></td>
            </tr><tr>
                <td>python高级编程</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/200.html">访问</a></td>
            </tr><tr>
                <td>scrapy分布式爬虫</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/92.html">访问</a></td>
            </tr><tr>
                <td>diango rest framework打造生鲜电商</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/131.html">访问</a></td>
            </tr><tr>
                <td>tornado从入门到精通</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/290.html">访问</a></td>
            </tr></tbody></table>

</div>
</body>
</html>
"""
# Parse the HTML text first; "::text" selects the text nodes of the matched
# <p> children instead of the elements themselves.
page = Selector(text=html)
paragraph_texts = page.css(".teacher_info > p::text").getall()
# Keep only the first text value.
age_tag_value = paragraph_texts[0]
print(age_tag_value)

 输出结果:

 输出对应class属性的指定第n个孩子节点的值

from scrapy import Selector

html="""
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>bobby基本信息</title>
    <script src="jquery-3.5.1.min.js"></script>
</head>
<body>
    <div id="info">
        <p style="color: blue">讲师信息</p>
        <div class="teacher_info">
            Python全栈工程师
            <p class="age">年龄:29</p>
            <p class="name bobbyname" data-bind="bobby">姓名:bobby</p>
            <p class="work_years">工作年限:7年</p>
            <p class="position">职位:python开发工程师</p>
        </div>
        <p style="color:aquamarine">课程信息</p>
        <table class="courses">
            <tbody><tr><th>课程名称</th>
            <th>讲师</th>
            <th>地址</th>
        </tr><tr>
                <td>django打造在线教育</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/78.html">访问</a></td>
            </tr><tr>
                <td>python高级编程</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/200.html">访问</a></td>
            </tr><tr>
                <td>scrapy分布式爬虫</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/92.html">访问</a></td>
            </tr><tr>
                <td>diango rest framework打造生鲜电商</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/131.html">访问</a></td>
            </tr><tr>
                <td>tornado从入门到精通</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/290.html">访问</a></td>
            </tr></tbody></table>

</div>
</body>
</html>
"""
# Parse the HTML text first; the three queries below all read the text of the
# <p> that is the 2nd child element of its parent under ".teacher_info".
sel = Selector(text=html)
# ":nth-child" (with a hyphen) is the standard CSS spelling of this
# pseudo-class; the underscore form is not valid CSS.
name_tag = ".teacher_info p:nth-child(2)::text"
# Child combinator (">"): only <p> elements directly inside .teacher_info.
name_tag_value1 = sel.css(".teacher_info > p:nth-child(2)::text").extract()[0]
print(name_tag_value1)
# Descendant combinator (space): any <p> nested below .teacher_info.
name_tag_value2 = sel.css(".teacher_info p:nth-child(2)::text").extract()[0]
print(name_tag_value2)
# Same descendant query, passed in through a variable.
name_tag_value3 = sel.css(name_tag).extract()[0]
print(name_tag_value3)

 输出结果:

 输出紧跟在指定class元素之后的第一个相邻p元素的文本值(相邻兄弟选择器 +)

from scrapy import Selector

# Sample page used by the adjacent-sibling ("+") selector example below.
html = """
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>bobby基本信息</title>
    <script src="jquery-3.5.1.min.js"></script>
</head>
<body>
    <div id="info">
        <p style="color: blue">讲师信息</p>
        <div class="teacher_info">
            Python全栈工程师
            <p class="age">年龄:29</p>
            <p class="name bobbyname" data-bind="bobby">姓名:bobby</p>
            <p class="work_years">工作年限:7年</p>
            <p class="position">职位:python开发工程师</p>
        </div>
        <p style="color:aquamarine">课程信息</p>
        <table class="courses">
            <tbody><tr><th>课程名称</th>
            <th>讲师</th>
            <th>地址</th>
        </tr><tr>
                <td>django打造在线教育</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/78.html">访问</a></td>
            </tr><tr>
                <td>python高级编程</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/200.html">访问</a></td>
            </tr><tr>
                <td>scrapy分布式爬虫</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/92.html">访问</a></td>
            </tr><tr>
                <td>diango rest framework打造生鲜电商</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/131.html">访问</a></td>
            </tr><tr>
                <td>tornado从入门到精通</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/290.html">访问</a></td>
             </tr>
    </tbody>
  </table> </div> </body> </html> """
# Parse the HTML text first. These statements must sit on their own lines:
# when they share a line with the comment marker they are never executed.
sel = Selector(text=html)
# "+" is the adjacent-sibling combinator: the <p> that immediately follows
# the element with class "teacher_info".
course_info1 = sel.css(".teacher_info + p::text").extract()[0]
print(course_info1)
 

输出结果:

 输出指定class元素之后所有同级p元素的文本值(通用兄弟选择器 ~)

from scrapy import Selector

html="""
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>bobby基本信息</title>
    <script src="jquery-3.5.1.min.js"></script>
</head>
<body>
    <div id="info">
        <p style="color: blue">讲师信息</p>
        <div class="teacher_info">
            Python全栈工程师
            <p class="age">年龄:29</p>
            <p class="name bobbyname" data-bind="bobby">姓名:bobby</p>
            <p class="work_years">工作年限:7年</p>
            <p class="position">职位:python开发工程师</p>
        </div>
        <p style="color:aquamarine">课程信息</p>
        <table class="courses">
            <tbody><tr><th>课程名称</th>
            <th>讲师</th>
            <th>地址</th>
        </tr><tr>
                <td>django打造在线教育</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/78.html">访问</a></td>
            </tr><tr>
                <td>python高级编程</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/200.html">访问</a></td>
            </tr><tr>
                <td>scrapy分布式爬虫</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/92.html">访问</a></td>
            </tr><tr>
                <td>diango rest framework打造生鲜电商</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/131.html">访问</a></td>
            </tr><tr>
                <td>tornado从入门到精通</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/290.html">访问</a></td>
            </tr></tbody></table>

</div>
</body>
</html>
"""
# Parse the HTML text first.
sel = Selector(text=html)
# "~" is the general-sibling combinator: every <p> that follows the
# ".teacher_info" element under the same parent. Keep the whole list instead
# of indexing [0]; taking only the first item would make this example
# indistinguishable from the adjacent-sibling ("+") one.
course_info2 = sel.css(".teacher_info ~ p::text").extract()
print(course_info2)

 输出结果:

 按href属性值精确匹配超链接,并输出该超链接的文本

from scrapy import Selector

html="""
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>bobby基本信息</title>
    <script src="jquery-3.5.1.min.js"></script>
</head>
<body>
    <div id="info">
        <p style="color: blue">讲师信息</p>
        <div class="teacher_info">
            Python全栈工程师
            <p class="age">年龄:29</p>
            <p class="name bobbyname" data-bind="bobby">姓名:bobby</p>
            <p class="work_years">工作年限:7年</p>
            <p class="position">职位:python开发工程师</p>
        </div>
        <p style="color:aquamarine">课程信息</p>
        <table class="courses">
            <tbody><tr><th>课程名称</th>
            <th>讲师</th>
            <th>地址</th>
        </tr><tr>
                <td>django打造在线教育</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/78.html">访问</a></td>
            </tr><tr>
                <td>python高级编程</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/200.html">访问</a></td>
            </tr><tr>
                <td>scrapy分布式爬虫</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/92.html">访问</a></td>
            </tr><tr>
                <td>diango rest framework打造生鲜电商</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/131.html">访问</a></td>
            </tr><tr>
                <td>tornado从入门到精通</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/290.html">访问</a></td>
            </tr></tbody></table>

</div>
</body>
</html>
"""
# Parse the HTML text first, then match the <a> whose href equals the given
# URL exactly and read its text node.
page = Selector(text=html)
exact_href_query = "a[href='https://coding.imooc.com/class/200.html']::text"
matched_texts = page.css(exact_href_query).getall()
# Keep only the first text value.
course_link_text = matched_texts[0]
print(course_link_text)

 输出结果:

 获取href属性值中包含某字符串的所有超链接的文本

from scrapy import Selector

html="""
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>bobby基本信息</title>
    <script src="jquery-3.5.1.min.js"></script>
</head>
<body>
    <div id="info">
        <p style="color: blue">讲师信息</p>
        <div class="teacher_info">
            Python全栈工程师
            <p class="age">年龄:29</p>
            <p class="name bobbyname" data-bind="bobby">姓名:bobby</p>
            <p class="work_years">工作年限:7年</p>
            <p class="position">职位:python开发工程师</p>
        </div>
        <p style="color:aquamarine">课程信息</p>
        <table class="courses">
            <tbody><tr><th>课程名称</th>
            <th>讲师</th>
            <th>地址</th>
        </tr><tr>
                <td>django打造在线教育</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/78.html">访问</a></td>
            </tr><tr>
                <td>python高级编程</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/200.html">访问</a></td>
            </tr><tr>
                <td>scrapy分布式爬虫</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/92.html">访问</a></td>
            </tr><tr>
                <td>diango rest framework打造生鲜电商</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/131.html">访问</a></td>
            </tr><tr>
                <td>tornado从入门到精通</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/290.html">访问</a></td>
            </tr></tbody></table>

</div>
</body>
</html>
"""
# Parse the HTML text first; "[href*='imooc']" matches every <a> whose href
# contains the substring "imooc", and "::text" reads each link's text.
page = Selector(text=html)
substring_query = "a[href*='imooc']::text"
course_url2 = page.css(substring_query).getall()
print(course_url2)

 输出结果:

 获取指定元素之后所有同级p元素的文本值

from scrapy import Selector

html="""
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>bobby基本信息</title>
    <script src="jquery-3.5.1.min.js"></script>
</head>
<body>
    <div id="info">
        <p style="color: blue">讲师信息</p>
        <div class="teacher_info">
            Python全栈工程师
            <p class="age">年龄:29</p>
            <p class="name bobbyname" data-bind="bobby">姓名:bobby</p>
            <p class="work_years">工作年限:7年</p>
            <p class="position">职位:python开发工程师</p>
        </div>
        <p style="color:aquamarine">课程信息</p>
        <table class="courses">
            <tbody><tr><th>课程名称</th>
            <th>讲师</th>
            <th>地址</th>
        </tr><tr>
                <td>django打造在线教育</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/78.html">访问</a></td>
            </tr><tr>
                <td>python高级编程</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/200.html">访问</a></td>
            </tr><tr>
                <td>scrapy分布式爬虫</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/92.html">访问</a></td>
            </tr><tr>
                <td>diango rest framework打造生鲜电商</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/131.html">访问</a></td>
            </tr><tr>
                <td>tornado从入门到精通</td>
                <td>bobby</td>
                <td><a href="https://coding.imooc.com/class/290.html">访问</a></td>
            </tr></tbody></table>

</div>
</body>
</html>
"""
# Parse the HTML text first, then select the text of every <p> that follows
# the <p class="name"> element under the same parent ("~" combinator).
page = Selector(text=html)
following_paragraphs = page.css("p.name ~ p::text")
sibling_tag = following_paragraphs.getall()
print(sibling_tag)

 输出结果:

posted @ 2024-05-20 23:12  leagueandlegends  阅读(27)  评论(0)    收藏  举报