From 8959e6bee5baa5aa08a643c4bb4409ca3afef3e1 Mon Sep 17 00:00:00 2001
From: HenryShan <zshan2@illinois.edu>
Date: Mon, 8 Mar 2021 15:48:05 +0800
Subject: [PATCH] assignment-2.1 version2.1: GET API fixed

---
 API/Functions.py                  |  21 ++++
 Crawler/BookPack.py               |   6 +-
 Crawler/Main.py                   |  51 +++++++++++--
 Crawler/Spider.py                 |  86 +++++++++++++++++----
 DataBase/JsonParser.py            |  53 +++++++++++++
 DataBase/mongoDB.py               | 123 ++++++++++++++++++++++++++++++
 RegularExpressionParser/Parser.py |  78 +++++++++++++++++++
 Server/SimpleServer.py            |  72 ++++++++++++++++++
 8 files changed, 465 insertions(+), 25 deletions(-)

diff --git a/API/Functions.py b/API/Functions.py
index e69de29..96e4881 100644
--- a/API/Functions.py
+++ b/API/Functions.py
@@ -0,0 +1,21 @@
+import requests
+import os
+import RegularExpressionParser.Parser as parser
+import re
+from dotenv import load_dotenv
+
+
+def get(query):
+    """
+    Send a GET request to find one or several documents
+    :param query: user input query
+    :return: JSON data of the matching book(s) or author(s)
+    """
+    load_dotenv()
+    host = os.getenv('SERVER_HOST')
+    url = host + parser.url_safe(query)
+    req = requests.get(url)
+    return req.json()
+
+
+
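For context, a minimal usage sketch of the new GET helper. It assumes a `.env` file defining `SERVER_HOST` (for example `http://127.0.0.1:5000/api/`) and a running instance of `Server/SimpleServer.py`; the query value is illustrative only:

```python
# Hypothetical client call; SERVER_HOST and the query string are assumptions.
import API.Functions as functions

# url_safe() percent-encodes any quotes and comparison signs in the query
# before the request is sent.
result = functions.get("search?q=book.rating%3A%3E4")
print(result)
```
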
diff --git a/Crawler/BookPack.py b/Crawler/BookPack.py
index 17b404e..42d4162 100644
--- a/Crawler/BookPack.py
+++ b/Crawler/BookPack.py
@@ -1,15 +1,15 @@
 class BookPackage:
-    def __init__(self, url, title, id, ISBN, author_url, author, rating,
+    def __init__(self, url, title, id, ISBN13, author_url, author, rating,
                  rating_count, review_count, image_url, similar_books):
         self.url = url
         self.title = title
         self.id = id
-        self.ISBN = ISBN
+        self.ISBN13 = ISBN13
         self.author_url = author_url
         self.author = author
         self.rating = rating
         self.rating_count = rating_count
         self.review_count = review_count
         self.image_url = image_url
-        self.similar_books = similar_books
\ No newline at end of file
+        self.similar_books = similar_books
diff --git a/Crawler/Main.py b/Crawler/Main.py
index c839815..24a7e96 100644
--- a/Crawler/Main.py
+++ b/Crawler/Main.py
@@ -1,12 +1,49 @@
 import Crawler.Spider as Spider
-import Crawler.BookPack as BookPack
-import Crawler.AuthorPack as AuthorPack
+import time
+from DataBase import mongoDB as db
 
 print('please enter URL of the book')
-# url = input()
-url = 'https://www.goodreads.com/book/show/3735293-clean-code'
-print('URL received, start scraping')
+url = input()
+# url = https://www.goodreads.com/book/show/3735293-clean-code
+# https://www.goodreads.com/book/show/35133922-educated
+print('please enter the number of books you want to crawl')
+book_num = input()
+# book_num = 3
+print('please enter the number of authors you want to crawl')
+author_num = input()
+# author_num = 2
+print('Request received, start scraping')
 
-startPage = Spider.Spider(url)
-startPage.scrap()
+book_dict = {}
+author_dict = {}
+similar_book_dict = {}
+count = 1
+
+print("Crawling book #" + str(count))
+start_page = Spider.Spider(url)
+start_page.crawl(book_dict, author_dict, similar_book_dict)
+
+
+while len(book_dict) < int(book_num) or len(author_dict) < int(author_num):
+    all_related_books = list(similar_book_dict.items())
+    for bookURLPair in all_related_books:
+        if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num):
+            break
+        book_name = bookURLPair[0]
+        if book_name in book_dict:
+            pass
+        else:
+            count += 1
+            print("Crawling book #" + str(count))
+            bookURL = bookURLPair[1]
+            book_spider = Spider.Spider(bookURL)
+            book_spider.crawl(book_dict, author_dict, similar_book_dict)
+
+            # Since the network delay from Asia to goodreads.com is already high,
+            # only a short sleep is added between requests.
+            time.sleep(0.2)
+
+db.insert_dicts(book_dict, 0)
+db.insert_dicts(author_dict, 1)
+print("Scraping completed")
diff --git a/Crawler/Spider.py b/Crawler/Spider.py
index 0b46824..a689789 100644
--- a/Crawler/Spider.py
+++ b/Crawler/Spider.py
@@ -1,5 +1,7 @@
 import requests
 from bs4 import BeautifulSoup
+import Crawler.BookPack as BookPack
+import Crawler.AuthorPack as AuthorPack
 
 
 class Spider:
@@ -8,26 +10,80 @@ class Spider:
         self.url = url
         self.soup = None
 
-    def scrap(self):
-        header = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+    def crawl(self, book_dict, author_dict, similar_book_dict):
+        header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                                 "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
         url = self.url
         res = requests.get(url, headers=header)
-        self.soup = BeautifulSoup(res.text, 'lxml')
+        self.soup = BeautifulSoup(res.text, "lxml")
+        titleAndAuthor = self.soup.title.string.split(" by ")
+        title = titleAndAuthor[0]
+        bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0]
+        rating_raw = self.soup.find("span", {"itemprop": "ratingValue"}).text
+        rating = float(str(rating_raw).replace(" ", "").replace("\n", ""))
+        rating_count = int(self.soup.find("meta", itemprop="ratingCount")["content"])
+        review_count = int(self.soup.find("meta", itemprop="reviewCount")["content"])
+        ISBN13_raw = self.soup.find("meta", property="books:isbn")
+        if ISBN13_raw["content"] == "null":
+            print("Warning: book '" + title + "' has no ISBN code")
+            ISBN13 = 0
+        else:
+            ISBN13 = int(ISBN13_raw["content"])
+        imageURL = self.soup.find("img", id="coverImage")["src"]
+        author = titleAndAuthor[1]
+        author_page_link = self.soup.find("a", {"class": "authorName"})["href"]
+        authorID = (author_page_link.split("show/")[1]).split(".")[0]
+        see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"]
+        res_similar = requests.get(see_more_link, headers=header)
+        similar_soup = BeautifulSoup(res_similar.text, 'lxml')
+        similar_books_and_authors = similar_soup.find_all("span", itemprop="name")
+        similar_books = []
+        for i in range(len(similar_books_and_authors)):
+            if i % 2 == 0:
+                # a book
+                similar_book = similar_books_and_authors[i]
+                similar_book_name = similar_book.text
+                book_link = "https://www.goodreads.com" + similar_book.find_parent("a")["href"]
+                similar_books.append(similar_book_name)
+                similar_book_dict[similar_book_name] = book_link
 
-        seeMoreLink = self.soup.select('body > div.content > div.mainContentContainer > div.mainContent > '
-                                       'div.mainContentFloat > div.rightContainer > div[id^=relatedWorks] > div > '
-                                       'div.bigBoxBody > div > a')[0].get("href")
-        print(seeMoreLink)
+        bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count,
+                                       review_count, imageURL, similar_books)
+        book_dict[title] = bookPKT
+        res_author = requests.get(author_page_link, headers=header)
+        author_soup = BeautifulSoup(res_author.text, 'lxml')
+        author_rating = float(author_soup.find("span", {"class": "average"}).text)
+        author_rating_count = float(author_soup.find("span", {"class": "value-title"})["title"])
+        author_review_count = float((author_soup.find_all("span", {"class": "value-title"})[1])["title"])
+        author_photo_url = author_soup.find("img", {"itemprop": "image"})["src"]
+        related_links = author_soup.find("div", {"class": "hreview-aggregate"}).findChildren("a", recursive=False)
+        related_authors_link = "https://www.goodreads.com" + related_links[1]["href"]
+        res_related_authors = requests.get(related_authors_link, headers=header)
+        related_author_soup = BeautifulSoup(res_related_authors.text, 'lxml')
+        related_authors = []
+        related_authors_raw = related_author_soup.find_all("span", {"itemprop": "name"})
+        for the_author in related_authors_raw:
+            related_author_name = the_author.text
+            if related_author_name == author:
+                pass
+            else:
+                related_authors.append(related_author_name)
+        author_books_link = "https://www.goodreads.com" + related_links[0]["href"]
+        res_author_books = requests.get(author_books_link, headers=header)
+        author_books_soup = BeautifulSoup(res_author_books.text, "lxml")
+        author_books_raw = author_books_soup.find_all("a", {"class": "bookTitle"})
+        author_books = []
+        for book in author_books_raw:
+            book_name = book.findChildren("span", recursive=False)
+            if not book_name:
+                pass
+            else:
+                book_name = book_name[0].text
+                author_books.append(book_name)
+        author_pkt = AuthorPack.AuthorPackage(author, author_page_link, authorID, author_rating, author_rating_count,
+                                              author_review_count, author_photo_url, related_authors, author_books)
+        author_dict[author] = author_pkt
 
-# print(soup)
-# print(soup.title.string)
-# print(soup.find_all('a'))
-# aTypes = soup.find_all('a')[0]
-# print(aTypes)
-# for obj in aTypes:
-#     if obj.
-# print(soup.)
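Taken together with Main.py above, the crawl contract is one `Spider` per book page, mutating the three shared dicts in place. A condensed driver sketch that mirrors Main.py rather than adding new behavior (sample URL taken from the comments in Main.py):

```python
# Condensed sketch of how crawl() is driven; no new behavior is assumed.
import Crawler.Spider as Spider

book_dict, author_dict, similar_book_dict = {}, {}, {}
spider = Spider.Spider("https://www.goodreads.com/book/show/3735293-clean-code")
spider.crawl(book_dict, author_dict, similar_book_dict)

# book_dict now maps the title to a BookPackage; similar_book_dict maps
# related titles to URLs that Main.py feeds into new Spider instances.
print(list(similar_book_dict.items())[:3])
```
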
diff --git a/DataBase/JsonParser.py b/DataBase/JsonParser.py
index e69de29..2a5e524 100644
--- a/DataBase/JsonParser.py
+++ b/DataBase/JsonParser.py
@@ -0,0 +1,53 @@
+
+def parse_book_dict_to_json(dictionary):
+    item_list = list(dictionary.items())
+    return_list = []
+    for items in item_list:
+        book_package = items[1]
+        book_json = parse_book_to_json(book_package)
+        return_list.append(book_json)
+    return return_list
+
+
+def parse_author_dict_to_json(dictionary):
+    item_list = list(dictionary.items())
+    return_list = []
+    for items in item_list:
+        author_package = items[1]
+        author_json = parse_author_to_json(author_package)
+        return_list.append(author_json)
+    return return_list
+
+
+def parse_book_to_json(bookPKT):
+    book_json = {
+        "type": "book",
+        "book_url": bookPKT.url,
+        "title": bookPKT.title,
+        "id": bookPKT.id,
+        "ISBN": bookPKT.ISBN13,
+        "author_url": bookPKT.author_url,
+        "author": bookPKT.author,
+        "rating": bookPKT.rating,
+        "rating_count": bookPKT.rating_count,
+        "review_count": bookPKT.review_count,
+        "image_url": bookPKT.image_url,
+        "similar_books": bookPKT.similar_books
+    }
+    return book_json
+
+
+def parse_author_to_json(authorPKT):
+    author_json = {
+        "type": "author",
+        "name": authorPKT.name,
+        "author_url": authorPKT.url,
+        "id": authorPKT.id,
+        "rating": authorPKT.rating,
+        "rating_count": authorPKT.rating_count,
+        "review_count": authorPKT.review_count,
+        "image_url": authorPKT.image_url,
+        "related_authors": authorPKT.related_authors,
+        "author_books": authorPKT.author_books
+    }
+    return author_json
diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py
index e69de29..c4e2222 100644
--- a/DataBase/mongoDB.py
+++ b/DataBase/mongoDB.py
@@ -0,0 +1,123 @@
+import json
+import DataBase.JsonParser as parser
+import os
+from dotenv import load_dotenv
+from pymongo import MongoClient
+
+
+def get_db():
+    """ return the database of goodReads crawler """
+    load_dotenv()
+    url = os.getenv('MONGODB_URL')
+    client = MongoClient(url)
+    return client.get_database("crawler_db")
+
+
+def insert_dicts(dictionary, opt):
+    """
+    Insert books or authors into the matching collection of the database
+    :param dictionary: the dictionary to insert into the collection
+    :param opt: =0 means books collection; =1 means authors collection
+    :return: no return value
+    """
+    db = get_db()
+    if opt == 0:
+        records = db.books
+    elif opt == 1:
+        records = db.authors
+    else:
+        print("failed to insert: wrong opt for selecting collection")
+        return
+    json_list = []
+    if opt == 0:
+        json_list = parser.parse_book_dict_to_json(dictionary)
+    elif opt == 1:
+        json_list = parser.parse_author_dict_to_json(dictionary)
+    records.insert_many(json_list)
+
+
+def update_dicts(opt, identifier, content):
+    """
+    Update documents in a given collection
+    :param opt: =0 means books collection; =1 means authors collection
+    :param identifier: the identifier of the documents we want to update
+    :param content: the content to update
+    :return: no return value
+    """
+    db = get_db()
+    if opt == 0:
+        records = db.books
+    elif opt == 1:
+        records = db.authors
+    else:
+        print("failed to update: wrong opt for selecting collection")
+        return
+    result = records.update_many(
+        identifier,
+        {"$set": content},
+    )
+    print("matched documents: " + str(result.matched_count))
+    print("modified documents: " + str(result.modified_count))
+
+
+def get_documents_json(opt, identifier):
+    """
+    Find documents specified by the identifier and output them as JSON
+    :param opt: =0 means books collection; =1 means authors collection
+    :param identifier: identifier of the documents we want, {} means the whole collection
+    :return: JSON data of the selected documents
+    """
+    db = get_db()
+    if opt == 0:
+        records = db.books
+    elif opt == 1:
+        records = db.authors
+    else:
+        print("failed to get json file: wrong opt for selecting collection")
+        return json.dumps({})
+    data = records.find(identifier)
+    file = {}
+    if opt == 0:
+        typeName = "books"
+    else:
+        typeName = "authors"
+    file[typeName] = []
+    for item in data:
+        item.pop("_id")
+        file[typeName].append(item)
+    return json.dumps(file)
+
+
+def download_collection(opt, identifier, name):
+    """
+    Download the books or authors collection as a local JSON file
+    :param opt: =0 means books collection; =1 means authors collection
+    :param identifier: identifier of the documents we want to download;
+            empty({}) means select all documents in the given collection
+    :param name: file name of the downloaded JSON
+    :return: no return value; writes <ROOT><name>.json to disk
+    """
+    json_file = get_documents_json(opt, identifier)
+    load_dotenv()
+    root = os.getenv('ROOT')
+    with open(root + name + ".json", "w") as output:
+        output.write(json_file)
+
+
+def clean(opt, identifier):
+    """
+    Delete specific documents in the given collection
+    :param opt: =0 means books collection; =1 means authors collection
+    :param identifier: identifier of the documents we want to delete;
+            empty({}) means select all documents in the given collection
+    :return: no return value
+    """
+    db = get_db()
+    if opt == 0:
+        records = db.books
+    elif opt == 1:
+        records = db.authors
+    else:
+        print("failed to delete: wrong opt for selecting collection")
+        return
+    records.delete_many(identifier)
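A short sketch of how these helpers are meant to be called, assuming `.env` defines `MONGODB_URL` and `ROOT` and that a crawl has already populated the books collection; the title filter is an example value:

```python
# Identifiers are plain MongoDB filters; the values here are assumptions.
import DataBase.mongoDB as db

print(db.get_documents_json(0, {"title": "Clean Code"}))   # opt=0: books
db.update_dicts(0, {"title": "Clean Code"}, {"rating": 4.4})
db.download_collection(1, {}, "authors_backup")            # writes ROOT/authors_backup.json
```
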
diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py
index e69de29..27f4fe6 100644
--- a/RegularExpressionParser/Parser.py
+++ b/RegularExpressionParser/Parser.py
@@ -0,0 +1,78 @@
+import re
+
+
+def check_if_address_valid(address):
+    """
+    Check whether the input address is a valid Goodreads book URL
+    :param address: the URL to validate
+    :return: True if the address is valid, False otherwise
+    """
+    x = re.search("^https://www.goodreads.com/book/show/[0-9]+", address)
+    if x:
+        return True
+    else:
+        return False
+
+
+def parse_query_to_url(query):
+    elements = re.findall("[0-9A-Za-z_\"><]+", query)
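To make the accepted query grammar concrete, a few illustrative calls (note that the tokenizer has no decimal-point support, so numeric operands should be integers):

```python
import RegularExpressionParser.Parser as parser

parser.check_if_address_valid("https://www.goodreads.com/book/show/3735293-clean-code")  # True
parser.parse_query_to_url("book.rating: > 4")   # 'book.rating%3A%3E4'
parser.parse_query_to_json("book.rating: > 4")  # {'book.rating': {'$gt': 4.0}}
parser.parse_query_to_json("book.author: NOT Martin")
# {'book.author': {'$not': {'$regex': 'Martin', '$options': 'i'}}}
```
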
+    count = len(elements)
+    if count == 3:
+        # can only be A.B:C or wrong
+        if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:[0-9a-zA-Z_\"]", query):
+            return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2])
+        else:
+            print("Invalid query.")
+            return ""
+    elif count == 4:
+        # a pair and one of [NOT, >, <].
+        if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:\sNOT\s[0-9a-zA-Z_\"]", query):
+            return url_safe(elements[0] + "." + elements[1] + "%3A" + "NOT" + elements[3])
+        elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:\s>\s[0-9a-zA-Z_\"]", query):
+            return url_safe(elements[0] + "." + elements[1] + "%3A" + ">" + elements[3])
+        elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:\s<\s[0-9a-zA-Z_\"]", query):
+            return url_safe(elements[0] + "." + elements[1] + "%3A" + "<" + elements[3])
+        else:
+            print("Invalid query.")
+            return ""
+    elif 6 <= count <= 8:
+        # AND or OR operator
+        if re.search(".*\sAND\s.*", query):
+            parts = query.split(" AND ")
+            return parse_query_to_url(parts[0]) + "%26AND%26" + parse_query_to_url(parts[1])
+        elif re.search(".*\sOR\s.*", query):
+            parts = query.split(" OR ")
+            return parse_query_to_url(parts[0]) + "%26OR%26" + parse_query_to_url(parts[1])
+        else:
+            print("Invalid query.")
+            return ""
+
+
+def parse_query_to_json(pair):
+    elements = re.findall("[0-9A-Za-z_\"><]+", pair)
+    count = len(elements)
+    print(count)
+    if count != 3 and count != 4:
+        print("Failed to parse query: invalid args number")
+        return {"wrong": "True"}  # will never be found in the database
+    elif count == 3:
+        # can be A.B: C or A.B: "C"
+        if re.search("\".*\"", elements[2]):
+            return {elements[0] + "." + elements[1]: elements[2]}
+        else:
+            return {elements[0] + "." + elements[1]: {"$regex": elements[2], "$options": "i"}}
+    else:
+        # can be NOT, >, or <
+        if elements[2] == "NOT":
+            return {elements[0] + "." + elements[1]: {"$not": {"$regex": elements[3], "$options": "i"}}}
+        elif elements[2] == ">":
+            return {elements[0] + "." + elements[1]: {"$gt": float(elements[3])}}
+        elif elements[2] == "<":
+            return {elements[0] + "." + elements[1]: {"$lt": float(elements[3])}}
+        else:
+            print("Failed to parse query: unknown operator")
+            return {"wrong": "True"}
+
+
+def url_safe(element):
+    return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C")
\ No newline at end of file
diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py
index e69de29..c416ace 100644
--- a/Server/SimpleServer.py
+++ b/Server/SimpleServer.py
@@ -0,0 +1,72 @@
+import RegularExpressionParser.Parser as parser
+import DataBase.mongoDB as db
+import re
+from flask import Flask
+from flask import request
+from urllib.parse import urlparse, parse_qs
+
+app = Flask(__name__)
+app.config["DEBUG"] = True
+
+
+@app.route("/", methods=['GET'])
+def home():
+    return "200: successfully connected to home page\n"
+
+
+@app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"])
+def data(collection):
+    if request.method == "GET":
+        if collection == "books" or collection == "book":
+            identifier = request.args.to_dict()
+            print(identifier)
+            print(type(identifier))
+            return "200: find"
+        elif collection == "authors" or collection == "author":
+            print(request.url)
+            return "200: find"
+        elif collection == "search":
+            url_parsed = urlparse(request.url)
+            qs_parsed = parse_qs(url_parsed.query)
+            return search_document(qs_parsed["q"][0].split("&"))
+        else:
+            return "404 not found"
+    # elif request.method == "PUT":
+
+
+def search_document(identifiers):
+    if len(identifiers) == 1:
+        json_idt = parser.parse_query_to_json(identifiers[0])
+        if re.search("^book.*", identifiers[0]):
+            return db.get_documents_json(0, json_idt)
+        else:
+            return db.get_documents_json(1, json_idt)
+    elif len(identifiers) == 3:
+        if re.search("^book.*", identifiers[0]):
+            if re.search("^author.*", identifiers[2]):
+                print("Failed to find documents: the two statements do not point to the same collection")
+                return {}
+            else:
+                opt = 0
+        else:
+            if re.search("^book.*", identifiers[2]):
+                print("Failed to find documents: the two statements do not point to the same collection")
+                return {}
+            else:
+                opt = 1
+        json_idt1 = parser.parse_query_to_json(identifiers[0])
+        json_idt2 = parser.parse_query_to_json(identifiers[2])
+        if identifiers[1] == "AND":
+            exp = {"$and": [json_idt1, json_idt2]}
+        elif identifiers[1] == "OR":
+            exp = {"$or": [json_idt1, json_idt2]}
+        else:
+            print("Failed to parse query: unknown operator for identifiers[1]")
+            return {}
+        return db.get_documents_json(opt, exp)
+    else:
+        return "Error, unknown identifiers"
+
+
+if __name__ == "__main__":
+    app.run()
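And a sketch of the routes from the client side, assuming Flask's default local address; the query strings are examples and must be percent-encoded in the form `Parser.url_safe` produces:

```python
import requests

base = "http://127.0.0.1:5000"
print(requests.get(base + "/").text)  # "200: successfully connected to home page"

# Single-clause search; decoded server-side to  book.rating:>4
print(requests.get(base + "/api/search/?q=book.rating%3A%3E4").text)

# AND of two clauses: %26 decodes to '&', which search_document splits on
print(requests.get(base + "/api/search/?q=book.rating%3A%3E4%26AND%26book.rating%3A%3C5").text)
```
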
-- 
GitLab