From e51da8b907a087f9e7a22c30523156d424625b76 Mon Sep 17 00:00:00 2001
From: HenryShan <zshan2@illinois.edu>
Date: Tue, 2 Mar 2021 07:50:31 +0800
Subject: [PATCH] assignment2.0 ver1.0: basic classes added

---
 Crawler/AuthorPack.py | 13 +++++++++++++
 Crawler/BookPack.py   | 15 +++++++++++++++
 Crawler/Main.py       | 12 ++++++++++++
 Crawler/Spider.py     | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 73 insertions(+)
 create mode 100644 Crawler/AuthorPack.py
 create mode 100644 Crawler/BookPack.py
 create mode 100644 Crawler/Main.py
 create mode 100644 Crawler/Spider.py

diff --git a/Crawler/AuthorPack.py b/Crawler/AuthorPack.py
new file mode 100644
index 0000000..896b8e8
--- /dev/null
+++ b/Crawler/AuthorPack.py
@@ -0,0 +1,18 @@
+
+class AuthorPackage:
+    """Plain record of the values collected for one author.
+
+    Each constructor argument is stored, unchanged, on the instance under
+    an attribute of the same name; nothing is validated or converted.
+    """
+
+    def __init__(self, name, url, id, rating, rating_count,
+                 review_count, image_url, related_authors, author_books):
+        # Identification fields.
+        self.name, self.url, self.id = name, url, id
+        # Rating and review figures.
+        self.rating, self.rating_count, self.review_count = (
+            rating, rating_count, review_count)
+        # Remaining values, kept exactly as passed in (no copies are made).
+        self.image_url = image_url
+        self.related_authors, self.author_books = related_authors, author_books
diff --git a/Crawler/BookPack.py b/Crawler/BookPack.py
new file mode 100644
index 0000000..17b404e
--- /dev/null
+++ b/Crawler/BookPack.py
@@ -0,0 +1,19 @@
+
+class BookPackage:
+    """Plain record of the values collected for one book.
+
+    Each constructor argument is stored, unchanged, on the instance under
+    an attribute of the same name; nothing is validated or converted.
+    """
+
+    def __init__(self, url, title, id, ISBN, author_url, author, rating,
+                 rating_count, review_count, image_url, similar_books):
+        # Identification fields.
+        self.url, self.title, self.id, self.ISBN = url, title, id, ISBN
+        # Author reference.
+        self.author_url, self.author = author_url, author
+        # Rating and review figures.
+        self.rating, self.rating_count, self.review_count = (
+            rating, rating_count, review_count)
+        # Remaining values, kept exactly as passed in (no copies are made).
+        self.image_url, self.similar_books = image_url, similar_books
\ No newline at end of file
diff --git a/Crawler/Main.py b/Crawler/Main.py
new file mode 100644
index 0000000..c839815
--- /dev/null
+++ b/Crawler/Main.py
@@ -0,0 +1,12 @@
+import Crawler.Spider as Spider
+import Crawler.BookPack as BookPack
+import Crawler.AuthorPack as AuthorPack
+
+
+print('please enter URL of the book')
+# url = input()  # NOTE: input is disabled; the hard-coded URL below is used
+url = 'https://www.goodreads.com/book/show/3735293-clean-code'
+print('URL received, start scraping')
+# Build a Spider for the page and run its scrape step.
+startPage = Spider.Spider(url)
+startPage.scrap()
diff --git a/Crawler/Spider.py b/Crawler/Spider.py
new file mode 100644
index 0000000..0b46824
--- /dev/null
+++ b/Crawler/Spider.py
@@ -0,0 +1,57 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+class Spider:
+    """Download one page and find the first link in its related-works box.
+
+    Attributes:
+        url: Address of the page to download, as given to the constructor.
+        soup: Parsed document from the latest ``scrap`` call; ``None``
+            until ``scrap`` has run.
+    """
+
+    # Browser-like user agent sent with the request.
+    # NOTE(review): presumably needed because the site rejects the default
+    # client identifier -- confirm.
+    _HEADERS = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+                              "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
+
+    # CSS path to the anchors nested under the element whose id starts
+    # with "relatedWorks".
+    _LINK_SELECTOR = ('body > div.content > div.mainContentContainer > div.mainContent > '
+                      'div.mainContentFloat > div.rightContainer > div[id^=relatedWorks] > div > '
+                      'div.bigBoxBody > div > a')
+
+    def __init__(self, url):
+        self.url = url
+        self.soup = None
+
+    def scrap(self, timeout=10):
+        """Fetch ``self.url``, parse it into ``self.soup``, print the link.
+
+        Args:
+            timeout: Seconds to wait for the server before giving up.
+
+        Returns:
+            The ``href`` of the first anchor matched by the selector
+            (``None`` if that anchor has no ``href`` attribute).
+
+        Raises:
+            requests.RequestException: If the page cannot be fetched.
+            IndexError: If no anchor on the page matches the selector.
+        """
+        # Without a timeout a stalled connection would block forever.
+        res = requests.get(self.url, headers=self._HEADERS, timeout=timeout)
+        self.soup = BeautifulSoup(res.text, 'lxml')
+
+        anchors = self.soup.select(self._LINK_SELECTOR)
+        if not anchors:
+            # IndexError is what indexing the empty result would raise;
+            # raising it here adds the failing URL to the message.
+            raise IndexError(
+                'no related-works link found on page: {}'.format(self.url))
+
+        seeMoreLink = anchors[0].get("href")
+        print(seeMoreLink)
+        return seeMoreLink
-- 
GitLab