???????????(Web?Spider)????????(URL)????????????(URL)?????????????????
??????????URL?????????????????(???????????Chrome????????Firefox???????????????????)
-???????????(???https://www.baidu.com/robots.txt?????????????????)
-??????????
-???????
??URL ????????????????Uniform Resource Locator???????????????[]???????
protocol :// hostname[:port] / path / [;parameters][?query]#fragment
???????????
protocol????????????google????https???
hostname[:port]???????????????????????????http????????????80???https?????????443?
path?????????????????????????????????????????????????????????????????????????????????path???
100006079301.html?100003887822.html
?????????????? URL ?????? HTML ???? Python3 ?????? urllib.request ?requests??????
1?request??
- ???pip3 install requests. --- urllib,urllib2 ?????py?????requests?????????????
# **** ???? ****
# ????
# import requests
#
# # ??get????????
#resp=requests.get('https://www.baidu.com')
#
# # ???????
# print(resp.text)
#
# with open('a.html','w',encoding='utf-8') as f:
#     f.write(resp.text)
#
#
# # ????????
# print(resp.status_code)
??????
1????????? requests.get() ???????????? GET ???
import requests
if __name__ == '__main__':
    # Minimal example: send an HTTP GET request and print the response body.
    url = "http://www.baidu.com/"
    # Pass a timeout so the script cannot hang forever if the server
    # never responds (requests applies no timeout by default).
    req = requests.get(url=url, timeout=10)
    # Decode the body as UTF-8 before reading .text; the page itself
    # declares charset=utf-8 in its <meta> tag.
    req.encoding = 'utf-8'
    print(req.text)
?????
<!DOCTYPE html>
<!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8><meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer><link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css><title>百度一下，你就知道</title></head> <body link=#0000cc> <div id=wrapper> <div id=head> <div class=head_wrapper> <div class=s_form> <div class=s_form_wrapper> <div id=lg> <img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129> </div> <form id=form name=f action=//www.baidu.com/s class=fm> <input type=hidden name=bdorz_come value=1> <input type=hidden name=ie value=utf-8> <input type=hidden name=f value=8> <input type=hidden name=rsv_bp value=1> <input type=hidden name=rsv_idx value=1> <input type=hidden name=tn value=baidu><span class="bg s_ipt_wr"><input id=kw name=wd class=s_ipt value maxlength=255 autocomplete=off autofocus></span><span class="bg s_btn_wr"><input type=submit id=su value=百度一下 
class="bg s_btn"></span> </form> </div> </div> <div id=u1> <a href=http://news.baidu.com name=tj_trnews class=mnav>新闻</a> <a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a> <a href=http://map.baidu.com name=tj_trmap class=mnav>地图</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>视频</a> <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>贴吧</a> <noscript> <a href=http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>登录</a> </noscript> <script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">登录</a>');</script> <a href=//www.baidu.com/more/ name=tj_briicon class=bri style="display: block;">更多产品</a> </div> </div> </div> <div id=ftCon> <div id=ftConw> <p id=lh> <a href=http://home.baidu.com>关于百度</a> <a href=http://ir.baidu.com>About Baidu</a> </p> <p id=cp>