Commit 155ae6b1 authored by zshan2

assignment2.1 version3: part3 finished, start testing

parent a3565f17
1 merge request: !1 assignment-2.1
@@ -19,7 +19,7 @@ def get(query):
print("GET: " + query)
url = safe_url(query)
req = requests.get(url)
print("response code: " + str(req.status_code))
print("\nresponse code: " + str(req.status_code) + "\n")
return req.json()
@@ -27,15 +27,15 @@ def put(query, json_file):
print("PUT: " + query)
url = safe_url(query)
req = requests.put(url, json=json_file)
print("response code: " + str(req.status_code))
print("\nresponse code: " + str(req.status_code) + "\n")
return req.text
def post(query, json_file):
print("POST: " + query)
url = safe_url(query)
req = requests.put(url, json=json_file)
print("response code: " + str(req.status_code))
req = requests.post(url, json=json_file)
print("\nresponse code: " + str(req.status_code) + "\n")
return req.text
@@ -43,7 +43,7 @@ def delete(query):
print("DELETE: " + query)
url = safe_url(query)
req = requests.delete(url)
print("response code: " + str(req.status_code))
print("\nresponse code: " + str(req.status_code) + "\n")
return req.text
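These helpers are thin wrappers around the requests library. A minimal usage sketch, assuming the module is imported as API.Functions and that safe_url() prepends the server's base address (the id value is only illustrative):

import API.Functions as API

# fetch one book document by id; the query string is passed through safe_url() unchanged
book_json = API.get("book?id=3735293")

# create or update a document, then remove it again
print(API.put("book?id=3735293", {"rating": 4.5}))
print(API.delete("book?id=3735293"))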
import typer
import RegularExpressionParser.Parser as Parser
import Crawler.Main as Main
import Crawler.Scrape as Scrape
import API.Functions as API
import os
import json
import re
from dotenv import load_dotenv
app = typer.Typer()
@@ -11,20 +12,24 @@ app = typer.Typer()
@app.command()
def main_page():
print("Welcome to use goodReads scraper!\n"
"This is main page\n"
"Usage:\n"
"scrape: scrape goodRead and load data to database\n"
"get: find a document for a book or an author from database\n"
"put: create or update a document in database\n"
"post: add one or multiple documents of book or author in database \n"
"delete: delete certain document from database\n"
"export: export a certain collection from database\n"
"exit: end the program")
"""
main page of the client; run this .py directly to enter it and see the usage
:return: none
"""
print("==================================================================\n\n"
"Welcome to use goodReads scraper!\n\n"
"This is main page\n\n"
"You can input back to return to this page at anytime\n\n"
"Usage:\n\n"
"get: find a document for a book or an author from database\n\n"
"put: create or update a document in database\n\n"
"post: add one or multiple documents of book or author in database \n\n"
"delete: delete certain document from database\n\n"
"export: export a certain collection from database\n\n"
"exit: end the program\n\n"
"Please enter your command:\n")
order = input()
if order == "scrape":
scrape()
elif order == "get":
if order == "get":
get()
elif order == "put":
put()
@@ -34,152 +39,186 @@ def main_page():
delete()
elif order == "export":
export()
elif order == "exit":
if order == "exit":
exit()
else:
print("Unknown command")
main_page()
def scrape():
def check_and_back(command):
""" check if used in put back and return to main page if so """
if command == "back":
main_page()
def back_to_main_page():
""" wait for user to press enter and then return to the main page """
print("press enter to return to main page")
no_use = input()
main_page()
def scrape(query):
"""
Function used to send a scrape request to the server.
Only for assignment 2.0; for GET scrape, see GET in SimplerServer.py.
"""
print('please enter URL of the book')
url = input()
print('please enter the number of books you want to crawl')
book_num = input()
attributes = query.split("?")[1].split("&")
print(attributes)
url = attributes[0].split("=", 1)[1]
book_num = attributes[1].split("=")[1]
if not book_num.isdigit():
print("max book num must be an integer")
main_page()
back_to_main_page()
book_num = int(book_num)
print('please enter the number of authors you want to crawl')
author_num = input()
author_num = attributes[2].split("=")[1]
if not author_num.isdigit():
print("max author num must be an integer")
main_page()
back_to_main_page()
author_num = int(author_num)
print('Request received, start scraping')
if not Parser.check_if_address_valid(url):
print("Error: invalid URL")
main_page()
elif book_num >= 2000 or author_num >= 2000:
print("Error, the number of book and author should be lower than 2000")
main_page()
back_to_main_page()
elif book_num >= 2000 or author_num >= 500 or book_num <= 0 or author_num <= 0:
print("Error, the number of book and author should be positive and lower than 2000/500")
back_to_main_page()
else:
if book_num > 200 or author_num > 50:
print("Warning, maximum of the number of books or authors is large")
Main.scrape_api(url, book_num, author_num)
main_page()
result = API.post(query, {})
return result
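# Illustrative note (an assumption based on the parameter names the server reads later in
# this commit): the query passed to scrape() is expected to look like
#   scrape?url=https://www.goodreads.com/book/show/3735293-clean-code&max_book=20&max_author=5
# so attributes[0], attributes[1] and attributes[2] carry url, max_book and max_author.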
def get():
print("Please enter your query:")
""" GET API """
print("==================================================================\n")
print("Please enter your query:\n")
query = input()
check_and_back(query)
result = API.get(query)
typer.echo(result)
main_page()
print()
back_to_main_page()
def put():
print("Please enter your query:")
""" PUT API """
print("==================================================================\n")
print("Please enter your query:\n")
query = input()
print("Please select one way to load update info: input or read:")
check_and_back(query)
print("Please select one way to load update info: input or read:\n")
load = input()
check_and_back(load)
if load == "input":
print("Please enter json data:")
print("Please enter json data:\n")
string = input()
check_and_back(string)
try:
data = json.loads(string)
except ValueError as err:
print(err)
main_page()
back_to_main_page()
elif load == "read":
load_dotenv()
file_root = os.getenv('FILE_ROOT')
print("Please enter the name of file which is in the file root directory:")
file_path = r"" + file_root + input()
print("Please enter the name of file which is in the file root directory:\n")
name = input()
check_and_back(name)
file_path = r"" + file_root + name
if os.path.isfile(file_path):
with open(file_path, "r") as file:
try:
data = json.load(file)
except ValueError as err:
print(err)
main_page()
back_to_main_page()
else:
print("Error: file not exist")
main_page()
back_to_main_page()
else:
print("Unknown command")
main_page()
back_to_main_page()
result = API.put(query, data)
typer.echo(result)
main_page()
print()
back_to_main_page()
def post():
print("Please enter your query:")
""" POST API """
print("==================================================================\n")
print("Please enter your query:\n")
query = input()
load_dotenv()
file_root = os.getenv('FILE_ROOT')
print("Please enter the name of file which is in the file root directory:")
file_path = r"" + file_root + input()
if os.path.isfile(file_path):
with open(file_path, "r") as file:
try:
data = json.load(file)
except ValueError as err:
print(err)
main_page()
check_and_back(query)
if re.search("scrape", query):
result = scrape(query)
typer.echo(result)
print()
back_to_main_page()
else:
print("Error: file not exist")
main_page()
result = API.post(query, data)
typer.echo(result)
main_page()
load_dotenv()
file_root = os.getenv('FILE_ROOT')
print("Please enter the name of file which is in the file root directory:\n")
name = input()
check_and_back(name)
file_path = r"" + file_root + name
if os.path.isfile(file_path):
with open(file_path, "r") as file:
try:
data = json.load(file)
except ValueError as err:
print(err)
back_to_main_page()
else:
print("Error: file not exist")
back_to_main_page()
result = API.post(query, data)
typer.echo(result)
print()
back_to_main_page()
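# Note on the flow above: post() now routes on the query text. A query containing "scrape"
# is forwarded to scrape(), which POSTs it to the server; any other query posts the contents
# of a JSON file located under the FILE_ROOT directory configured in the .env file.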
def delete():
print("Please enter your query:")
""" DELETE API """
print("==================================================================\n")
print("Please enter your query:\n")
query = input()
check_and_back(query)
result = API.delete(query)
typer.echo(result)
main_page()
print()
back_to_main_page()
def export():
print("Which collection do you want to export? books or authors?")
""" export the book or author collection """
print("==================================================================\n")
print("Which collection do you want to export: books or authors?\n")
collection = input()
check_and_back(collection)
if collection == "books":
json = API.get("book")
json_file = API.get("book")
load_dotenv()
file_root = os.getenv('FILE_ROOT')
with open(file_root + "books" + ".json", "w") as output:
output.write(json)
output.write(json_file)
print("books.json is stored at " + file_root)
main_page()
back_to_main_page()
elif collection == "authors":
json = API.get("author")
json_file = API.get("author")
load_dotenv()
file_root = os.getenv('FILE_ROOT')
with open(file_root + "authors" + ".json", "w") as output:
output.write(json)
output.write(json_file)
print("authors.json is stored at " + file_root)
main_page()
back_to_main_page()
elif collection == "back":
main_page()
back_to_main_page()
else:
print("Unknown collection")
main_page()
# def goodbye(name: str, formal: bool = False):
# """ function2 """
# if formal:
# typer.echo(f"Goodbye Ms. {name}. Have a good day.")
# else:
# typer.echo(f"Bye {name}!")
back_to_main_page()
if __name__ == "__main__":
......
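The put, post and export commands above resolve file paths through python-dotenv. A minimal sketch of the assumed configuration and lookup (the directory is only an example):

# .env, placed in the working directory:
#   FILE_ROOT=C:/goodreads_data/
import os
from dotenv import load_dotenv

load_dotenv()                        # loads the variables from .env into the environment
file_root = os.getenv("FILE_ROOT")   # e.g. "C:/goodreads_data/"
print(file_root + "books.json")      # where export() writes the books collection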
import Crawler.Spider as Spider
import time
import RegularExpressionParser.Parser as Parser
from DataBase import mongoDB as db
@@ -15,10 +14,10 @@ def scrape_api(url_api, book_num_api, author_num_api):
start_page_api = Spider.Spider(url_api)
start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api)
while len(book_dict_api) < int(book_num_api) and len(author_dict_api) < int(author_num_api):
while len(book_dict_api) < int(book_num_api) or len(author_dict_api) < int(author_num_api):
all_related_books_api = list(similar_book_dict_api.items())
for book_url_pair_api in all_related_books_api:
if len(book_dict_api) >= int(book_num_api) or len(author_dict_api) >= int(author_num_api):
if len(book_dict_api) >= int(book_num_api) and len(author_dict_api) >= int(author_num_api):
break
book_name_api = book_url_pair_api[0]
if book_name_api in book_dict_api:
@@ -36,46 +35,4 @@ def scrape_api(url_api, book_num_api, author_num_api):
db.insert_dicts(book_dict_api, 0)
db.insert_dicts(author_dict_api, 1)
print("Scraping completed")
print('please enter URL of the book')
url = input()
print('please enter the number of books you want to crawl')
book_num = input()
print('please enter the number of authors you want to crawl')
author_num = input()
print('Request received, start scraping')
book_dict = {}
author_dict = {}
similar_book_dict = {}
count = 1
print("Crawling book #" + str(count))
start_page = Spider.Spider(url)
start_page.crawl(book_dict, author_dict, similar_book_dict)
while len(book_dict) < int(book_num) or len(author_dict) < int(author_num):
all_related_books = list(similar_book_dict.items())
for book_url_pair in all_related_books:
if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num):
break
book_name = book_url_pair[0]
if book_name in book_dict:
pass
else:
count += 1
print("Crawling book #" + str(count))
book_url = book_url_pair[1]
book_spider = Spider.Spider(book_url)
book_spider.crawl(book_dict, author_dict, similar_book_dict)
# since the network delay from asia to goodread.com is already very high,
# a small amount of time of sleep is used here.
time.sleep(0.2)
db.insert_dicts(book_dict, 0)
db.insert_dicts(author_dict, 1)
print("Scraping completed")
print("Scraping completed")
\ No newline at end of file
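The condition swap above means crawling now continues while either quota is unmet, and the inner loop breaks only once both are met. A small self-contained sketch of that termination logic (the quotas and the fake crawl step are illustrative):

book_num, author_num = 2, 3
books, authors = {}, {}
step = 0
while len(books) < book_num or len(authors) < author_num:
    step += 1
    books["book" + str(step)] = None       # every step finds a new book...
    authors["author" + str(step)] = None   # ...and its author
    if len(books) >= book_num and len(authors) >= author_num:
        break                              # stop only when both quotas are met
print(len(books), len(authors))            # -> 3 3: books overshoot until the author quota is met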
@@ -11,12 +11,21 @@ class Spider:
self.soup = None
def crawl(self, book_dict, author_dict, similar_book_dict):
"""
function used to scrape data of a certain book
:param book_dict: dictionary of scraped books, keyed by book title
:param author_dict: dictionary of scraped authors
:param similar_book_dict: dictionary mapping titles of similar books to their URLs
:return: none
"""
header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
url = self.url
res = requests.get(url, headers=header)
self.soup = BeautifulSoup(res.text, "lxml")
# parsing data of current book
titleAndAuthor = self.soup.title.string.split(" by ")
title = titleAndAuthor[0]
bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0]
@@ -36,6 +45,8 @@ class Spider:
authorID = (author_page_link.split("show/")[1]).split(".")[0]
see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"]
res_similar = requests.get(see_more_link, headers=header)
# parsing similar books of current book
similar_soup = BeautifulSoup(res_similar.text, 'lxml')
similar_books_and_authors = similar_soup.find_all("span", itemprop="name")
similar_books = []
@@ -51,6 +62,8 @@ class Spider:
bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count,
review_count, imageURL, similar_books)
book_dict[title] = bookPKT
# parse data of the author of current book
res_author = requests.get(author_page_link, headers=header)
author_soup = BeautifulSoup(res_author.text, 'lxml')
author_rating = float(author_soup.find("span", {"class": "average"}).text)
......
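A standalone sketch of the string handling crawl() relies on, using an illustrative Goodreads-style page title and pair of URLs (no network access, only the split logic from the hunks above):

page_title = "Clean Code by Robert C. Martin"
book_url = "https://www.goodreads.com/book/show/3735293-clean-code"
author_link = "https://www.goodreads.com/author/show/45372.Robert_C_Martin"

title = page_title.split(" by ")[0]                                   # "Clean Code"
book_id = ((book_url.split("show/")[1]).split("-")[0]).split(".")[0]  # "3735293"
author_id = (author_link.split("show/")[1]).split(".")[0]             # "45372"
print(title, book_id, author_id)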
@@ -58,7 +58,7 @@ def parse_query_to_json(pair):
else:
# can be A.B: C or A.B: "C"
if re.search(":", pair):
if re.search("^[0-9.]*$", elements[1]):
if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]):
return {elements[0].split(".")[1]: float(elements[1])}
else:
if re.search("\".*\"", elements[1]):
@@ -67,7 +67,7 @@ def parse_query_to_json(pair):
return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}}
else:
if re.search("NOT", pair):
if re.search("^[0-9.]*$", elements[1]):
if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]):
return {elements[0].split(".")[1]: {"$ne": float(elements[1])}}
else:
return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}}
......
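Both hunks above add the same guard, so numeric-looking values are only coerced to float when the field is not an id. A minimal standalone sketch of that guard (the helper name is hypothetical):

import re

def coerce_to_float(field, value):
    """ mirrors the new condition: True only for numeric values on non-id fields """
    return bool(re.search("^[0-9.]*$", value)) and not re.search("id", field)

print(coerce_to_float("book.rating", "4.2"))   # True  -> value stored as float(4.2)
print(coerce_to_float("book.id", "3735293"))   # False -> id kept as a string/regex match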
import DataBase.mongoDB as DataBase
import RegularExpressionParser.Parser as Parser
import re
import Crawler.Scrape as Scrape
from flask import Flask
from flask import request
from urllib.parse import urlparse, parse_qs
@@ -20,21 +21,18 @@ def data_base(collection):
if collection == "book":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
# print(qs_parsed["id"])
if qs_parsed == {}:
return DataBase.get_documents_json(0, {})
return search_document(["book.id:" + qs_parsed["id"][0]])
elif collection == "author":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
# print(type(qs_parsed["id"]))
if qs_parsed == {}:
return DataBase.get_documents_json(1, {})
return search_document(["author.id:" + qs_parsed["id"][0]])
elif collection == "search":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
print(qs_parsed)
return search_document(qs_parsed["q"][0].split("&"))
else:
return "404: Unknown Collection to GET"
@@ -47,7 +45,7 @@ def data_base(collection):
elif collection == "author":
opt = 1
else:
return "404: Unknown Collection to GET"
return "404: Unknown Collection to PUT"
DataBase.update_dicts(opt, request.args.to_dict(), json_update_info)
return "200: PUT succeeded"
elif request.method == "POST":
@@ -63,7 +61,15 @@ def data_base(collection):
elif collection == "author":
DataBase.insert_document(json_file, 1)
elif collection == "scrape":
return "200"
# url_parsed = urlparse(request.url)
# qs_parsed = parse_qs(url_parsed.query)
# print(qs_parsed)
param = request.args.to_dict()
url = param["url"]
max_book = param["max_book"]
max_author = param["max_author"]
Scrape.scrape_api(url, max_book, max_author)
return "201: new data has been added to database"
else:
return "404: Unknown Collection to POST"
return "201: POST succeeded"
@@ -81,6 +87,7 @@ def data_base(collection):
def search_document(identifiers):
""" function used to find one or several document in database """
if len(identifiers) == 1:
json_idt = Parser.parse_query_to_json(identifiers[0])
print(json_idt)
@@ -96,7 +103,7 @@ def search_document(identifiers):
else:
opt = 0
else:
if re.search("^book.*",identifiers[2]):
if re.search("^book.*", identifiers[2]):
print("Failed to find documentation: two statements are not pointing to the same collection")
return {}
else:
......
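A hedged sketch of exercising the new scrape branch from outside with requests; the host, port and /api prefix are assumptions, since the route decorator sits outside the hunks shown here:

import requests

base = "http://127.0.0.1:5000/api"   # base address is an assumption

# start a crawl: the server reads url, max_book and max_author from the query string
resp = requests.post(base + "/scrape", params={
    "url": "https://www.goodreads.com/book/show/3735293-clean-code",
    "max_book": 20,
    "max_author": 5,
})
print(resp.status_code, resp.text)   # expected body: "201: new data has been added to database"

# fetch a single book by id; an empty query string returns the whole collection
print(requests.get(base + "/book", params={"id": "3735293"}).text)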