练习 Python + bs4 提取 LikeBookmark 网址导航的所有链接
# Scrape every category and link from a locally saved LikeBookmark
# navigation page (555.txt) and load them into a OneNav sqlite database
# (tables on_categorys / on_links), wiping any previous contents first.
from bs4 import BeautifulSoup
import re
import sqlite3

# Strips leftover markup from a link title. Non-greedy on purpose: the
# original greedy r'<.*>' would delete everything from the first '<' to
# the last '>', eating text that sits between two separate tags.
TAG_RE = re.compile(r'<.*?>')

conn = sqlite3.connect('onenav.db3')
print("数据库打开成功")
c = conn.cursor()

# Empty both tables and reset their AUTOINCREMENT counters so the ids
# inserted below start from 1 again.
c.execute("delete from on_links;")
c.execute("update sqlite_sequence SET seq = 0 where name ='on_links';")
c.execute("delete from on_categorys;")
c.execute("update sqlite_sequence SET seq = 0 where name ='on_categorys';")
conn.commit()
print("数据清空完毕")

# The page was saved to disk beforehand; read it via a context manager
# so the handle is closed even if parsing below raises.
with open('555.txt', encoding='utf-8') as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')
div_tags = soup.find_all('div', {'class': 'site-main-li'})

# Skip the first and last site-main-li blocks — presumably page
# header/footer sections rather than real categories (TODO confirm
# against the saved page).
category_id = 0
for div_tag in div_tags[1:-1]:
    category_id += 1
    # BUGFIX: the original passed a *set* {'class', 'site-tit'} where
    # bs4 expects an attrs *dict* mapping attribute name -> value.
    category = div_tag.find('div', {'class': 'site-tit'}).get_text().strip()
    links = div_tag.find_all('div', {'class': 'list siteList'})
    print(category)
    c.execute('insert into on_categorys (`id`, `name`) values (?, ?);',
              (category_id, category))
    for link in links:
        # Entries without a data-id attribute are placeholders; skip them.
        link_id = link.get('data-id')
        if not link_id:
            continue
        url = link['data-links']
        title = TAG_RE.sub('', link.find('p', class_="title").get_text().strip())
        print('\t'.join([url, title]))
        c.execute('insert into on_links (`fid`, `title`, `url`) values (?, ?, ?);',
                  (category_id, title, url))
    print('---')

conn.commit()
conn.close()
转自:（原文链接缺失 — 待补充出处）