# Source recovered from a git patch (mbox format) whose line breaks were lost.
#   Commit:  e51da8b907a087f9e7a22c30523156d424625b76
#   From:    HenryShan <zshan2@illinois.edu>
#   Date:    Tue, 2 Mar 2021 07:50:31 +0800
#   Subject: [PATCH] assignment2.0 ver1.0: basic classes added
# The patch creates four files (73 insertions in total):
#   Crawler/AuthorPack.py (13), Crawler/BookPack.py (15),
#   Crawler/Main.py (12), Crawler/Spider.py (33)
# Each banner below marks a separate file created by the patch.


# ---- Crawler/AuthorPack.py (new file) ----

class AuthorPackage:
    """Plain record of the fields kept for one author.

    Every constructor argument is stored unchanged on the instance under
    the same name; nothing is validated or converted.
    """

    def __init__(self, name, url, id, rating, rating_count,
                 review_count, image_url, related_authors, author_books):
        # NOTE: ``id`` shadows the builtin inside this method only; the
        # parameter name is kept because callers may pass it by keyword.
        self.name = name
        self.url = url
        self.id = id
        self.rating = rating
        self.rating_count = rating_count
        self.review_count = review_count
        self.image_url = image_url
        # presumably collections of related authors / books -- the element
        # type is not visible here; TODO confirm against the scraper.
        self.related_authors = related_authors
        self.author_books = author_books

    def __repr__(self):
        """Return ``AuthorPackage(field=value, ...)`` for debugging output."""
        # Built from the instance dict so it lists exactly what is stored,
        # in assignment order.
        fields = ", ".join(
            "{}={!r}".format(key, value) for key, value in vars(self).items()
        )
        return "{}({})".format(type(self).__name__, fields)


# ---- Crawler/BookPack.py (new file) ----

class BookPackage:
    """Plain record of the fields kept for one book.

    Every constructor argument is stored unchanged on the instance under
    the same name; nothing is validated or converted.
    """

    def __init__(self, url, title, id, ISBN, author_url, author, rating,
                 rating_count, review_count, image_url, similar_books):
        # NOTE: ``id`` and ``ISBN`` keep their original spelling because they
        # are part of the constructor's caller-visible interface.
        self.url = url
        self.title = title
        self.id = id
        self.ISBN = ISBN
        self.author_url = author_url
        self.author = author
        self.rating = rating
        self.rating_count = rating_count
        self.review_count = review_count
        self.image_url = image_url
        # presumably a collection of similar books -- element type is not
        # visible here; TODO confirm against the scraper.
        self.similar_books = similar_books

    def __repr__(self):
        """Return ``BookPackage(field=value, ...)`` for debugging output."""
        # Built from the instance dict so it lists exactly what is stored,
        # in assignment order.
        fields = ", ".join(
            "{}={!r}".format(key, value) for key, value in vars(self).items()
        )
        return "{}({})".format(type(self).__name__, fields)
# Source recovered from a git patch whose line breaks were lost.  Each banner
# below marks a separate file created by the patch; the sections are not
# meant to be executed together as one module.


# ---- Crawler/Main.py (new file) ----
import Crawler.Spider as Spider
import Crawler.BookPack as BookPack
import Crawler.AuthorPack as AuthorPack

# Start page used while interactive input is disabled.
DEFAULT_URL = 'https://www.goodreads.com/book/show/3735293-clean-code'


def main(url=DEFAULT_URL):
    """Scrape one book page, starting from *url*.

    Args:
        url: Address of the book page to fetch; defaults to ``DEFAULT_URL``.
    """
    print('please enter URL of the book')
    # TODO: interactive entry (``url = input()``) is currently disabled, so
    # the prompt above is informational only and *url* is used as given.
    print('URL received, start scraping')

    start_page = Spider.Spider(url)
    start_page.scrap()


# Guarded so that importing this module no longer triggers a network request.
if __name__ == '__main__':
    main()


# ---- Crawler/Spider.py (new file) ----
import requests
from bs4 import BeautifulSoup


class Spider:
    """Fetch a single page and extract one link from it.

    Attributes set by the constructor:
        url:  the address passed in.
        soup: ``None`` until ``scrap()`` succeeds, then the parsed page.
    """

    # Browser-like user agent sent with every request.
    USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")

    # CSS path to the anchor that is extracted; presumably the "see more"
    # link of the related-works box -- TODO confirm against the live page.
    SEE_MORE_SELECTOR = ('body > div.content > div.mainContentContainer > div.mainContent > '
                         'div.mainContentFloat > div.rightContainer > div[id^=relatedWorks] > div > '
                         'div.bigBoxBody > div > a')

    # Seconds to wait for the server before giving up, so a stalled
    # connection cannot hang the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        self.url = url
        self.soup = None

    def scrap(self):
        """Download ``self.url``, parse it and print the extracted link.

        Returns:
            The ``href`` value of the first element matching
            ``SEE_MORE_SELECTOR`` (``None`` if that element has no ``href``).

        Raises:
            requests.RequestException: the request failed, timed out, or the
                server answered with an HTTP error status.
            IndexError: no element on the page matches the selector.
        """
        headers = {'user-agent': self.USER_AGENT}

        res = requests.get(self.url, headers=headers,
                           timeout=self.REQUEST_TIMEOUT)
        # Do not parse error pages as if they were the requested page.
        res.raise_for_status()
        self.soup = BeautifulSoup(res.text, 'lxml')

        link_tag = self.soup.select_one(self.SEE_MORE_SELECTOR)
        if link_tag is None:
            # Same exception type the former ``select(...)[0]`` produced,
            # now with enough context to diagnose a layout change.
            raise IndexError(
                "no element matching the see-more selector on {}".format(self.url)
            )

        see_more_link = link_tag.get("href")
        print(see_more_link)
        return see_more_link

# (patch trailer: "-- GitLab")