import requests
from bs4 import BeautifulSoup
import pandas as pd


def crawling(soup):
    # Locate the container that holds the issue/news links.
    div = soup.find("div", class_="list_issue")
    if div is None:
        print("Could not find the list_issue section; the page layout may have changed.")
        return

    # Collect the text and target URL of every anchor in the container.
    result = []
    urls = []
    for a in div.find_all("a"):
        urls.append(a["href"])
        result.append(a.get_text())

    # Save the collected titles and URLs as a CSV file.
    df = pd.DataFrame({"news_title": result, "url": urls})
    print(df)
    df.to_csv("newscrawling.csv", index=False)
    print("Crawling Success!")


def main():
    # Send browser-like headers so the request is less likely to be blocked.
    CUSTOM_HEADER = {
        "referer": "https://www.naver.com/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/100.0.4896.127 Safari/537.36",
    }

    url = "https://www.naver.com/"
    req = requests.get(url, headers=CUSTOM_HEADER)
    print(req.status_code)

    soup = BeautifulSoup(req.text, "html.parser")
    crawling(soup)


if __name__ == "__main__":
    main()
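
# A quick way to inspect the output file (a sketch; assumes the crawl above
# succeeded and "newscrawling.csv" was written to the working directory):
#
#     import pandas as pd
#     df = pd.read_csv("newscrawling.csv")
#     print(df.head())  # expect two columns: news_title, url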