From a3565f17c3de72df48b40d29615e6ed5095d7fe9 Mon Sep 17 00:00:00 2001
From: HenryShan <zshan2@illinois.edu>
Date: Tue, 9 Mar 2021 00:53:56 +0800
Subject: [PATCH] assignment-2.1 version3: all API endpoints deployed, CLI
 initially finished
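
API/Functions.py now covers all four REST verbs (get/put/post/delete) via a
shared safe_url() helper and prints each response status code. Client/CLI.py
runs an interactive loop offering scrape, get, put, post, delete and export.
Crawler/Main.py gains a scrape_api() entry point for server-driven crawls,
DataBase/mongoDB.py adds insert_document() and switches update_dicts() to an
upserting update_one(), the query parser now expects two-token pairs, and
Server/SimpleServer.py dispatches GET/PUT/POST/DELETE per collection. Every
package gets an __init__.py so the absolute imports resolve.

Illustrative request shape (the host comes from SERVER_HOST in .env; the id
value below is made up):

    GET <SERVER_HOST>/api/book/?id=3735293

Known limitation: Crawler/Main.py still runs its interactive prompts at
module level, so importing it from the CLI triggers them.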

---
 API/Functions.py                    |  38 +++++-
 API/__init__.py                     |   0
 Client/CLI.py                       | 183 ++++++++++++++++++++++++++--
 Client/__init__.py                  |   0
 Crawler/Main.py                     |  48 ++++++--
 Crawler/__init__.py                 |   0
 DataBase/__init__.py                |   0
 DataBase/mongoDB.py                 |  22 +++-
 RegularExpressionParser/Parser.py   |  42 ++++---
 RegularExpressionParser/__init__.py |   0
 Server/SimpleServer.py              |  91 ++++++++++----
 Server/__init__.py                  |   0
 12 files changed, 360 insertions(+), 64 deletions(-)
 create mode 100644 API/__init__.py
 create mode 100644 Client/__init__.py
 create mode 100644 Crawler/__init__.py
 create mode 100644 DataBase/__init__.py
 create mode 100644 RegularExpressionParser/__init__.py
 create mode 100644 Server/__init__.py

diff --git a/API/Functions.py b/API/Functions.py
index 96e4881..452887c 100644
--- a/API/Functions.py
+++ b/API/Functions.py
@@ -1,21 +1,49 @@
 import requests
 import os
-import RegularExpressionParser.Parser as parser
-import re
+import RegularExpressionParser.Parser as Parser
 from dotenv import load_dotenv
 
 
+def safe_url(query):
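+    # build the full request URL: SERVER_HOST from .env plus the query passed through Parser.url_safe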
+    load_dotenv()
+    host = os.getenv('SERVER_HOST')
+    return host + Parser.url_safe(query)
+
+
 def get(query):
     """
     function used to send get request to find one or several documentations
     :param query: user input query
     :return: json file of the given author or book
     """
-    load_dotenv()
-    host = os.getenv('SERVER_HOST')
-    url = host + parser.url_safe(query)
+    print("GET: " + query)
+    url = safe_url(query)
     req = requests.get(url)
+    print("response code: " + str(req.status_code))
     return req.json()
 
 
+def put(query, json_file):
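+    # send a PUT request with a JSON body to update or create a document; returns the response text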
+    print("PUT: " + query)
+    url = safe_url(query)
+    req = requests.put(url, json=json_file)
+    print("response code: " + str(req.status_code))
+    return req.text
+
+
+def post(query, json_file):
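+    # send a POST request with a JSON body to insert one or more documents; returns the response text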
+    print("POST: " + query)
+    url = safe_url(query)
+    req = requests.post(url, json=json_file)
+    print("response code: " + str(req.status_code))
+    return req.text
+
+
+def delete(query):
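+    # send a DELETE request for the documents matching the query; returns the response text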
+    print("DELETE: " + query)
+    url = safe_url(query)
+    req = requests.delete(url)
+    print("response code: " + str(req.status_code))
+    return req.text
+
 
diff --git a/API/__init__.py b/API/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Client/CLI.py b/Client/CLI.py
index b3c40d6..8104183 100644
--- a/Client/CLI.py
+++ b/Client/CLI.py
@@ -1,21 +1,186 @@
 import typer
+import RegularExpressionParser.Parser as Parser
+import Crawler.Main as Main
+import API.Functions as API
+import os
+import json
+from dotenv import load_dotenv
 
 app = typer.Typer()
 
+
 @app.command()
-def scrape_book(url: str):
-    """ Function used to create scrape request to Server """
+def main_page():
+    print("Welcome to use goodReads scraper!\n"
+          "This is main page\n"
+          "Usage:\n"
+          "scrape: scrape goodRead and load data to database\n"
+          "get: find a document for a book or an author from database\n"
+          "put: create or update a document in database\n"
+          "post: add one or multiple documents of book or author in database \n"
+          "delete: delete certain document from database\n"
+          "export: export a certain collection from database\n"
+          "exit: end the program")
+    order = input()
+    if order == "scrape":
+        scrape()
+    elif order == "get":
+        get()
+    elif order == "put":
+        put()
+    elif order == "post":
+        post()
+    elif order == "delete":
+        delete()
+    elif order == "export":
+        export()
+    elif order == "exit":
+        exit()
+    else:
+        print("Unknown command")
+    main_page()
 
 
+def scrape():
+    """
+    Function used to create scrape request to Server
+    Only for assignment 2.0, for GET scrape, find it in GET in SimplerServer.py
+    """
+    print('please enter URL of the book')
+    url = input()
+    print('please enter the number of books you want to crawl')
+    book_num = input()
+    if not book_num.isdigit():
+        print("max book num must be an integer")
+        main_page()
+    book_num = int(book_num)
+    print('please enter the number of authors you want to crawl')
+    author_num = input()
+    if not author_num.isdigit():
+        print("max author num must be an integer")
+        main_page()
+    author_num = int(author_num)
+    print('Request received, start scraping')
+    if not Parser.check_if_address_valid(url):
+        print("Error: invalid URL")
+        main_page()
+    elif book_num >= 2000 or author_num >= 2000:
+        print("Error, the number of book and author should be lower than 2000")
+        main_page()
+    else:
+        if book_num > 200 or author_num > 50:
+            print("Warning, maximum of the number of books or authors is large")
+        Main.scrape_api(url, book_num, author_num)
+        main_page()
 
-@app.command()
-def goodbye(name: str, formal: bool = False):
-    """ function2 """
-    if formal:
-        typer.echo(f"Goodbye Ms. {name}. Have a good day.")
+
+def get():
+    print("Please enter your query:")
+    query = input()
+    result = API.get(query)
+    typer.echo(result)
+    main_page()
+
+
+def put():
+    print("Please enter your query:")
+    query = input()
+    print("Please select one way to load update info: input or read:")
+    load = input()
+    if load == "input":
+        print("Please enter json data:")
+        string = input()
+        try:
+            data = json.loads(string)
+        except ValueError as err:
+            print(err)
+            return main_page()
+    elif load == "read":
+        load_dotenv()
+        file_root = os.getenv('FILE_ROOT')
+        print("Please enter the name of file which is in the file root directory:")
+        file_path = r"" + file_root + input()
+        if os.path.isfile(file_path):
+            with open(file_path, "r") as file:
+                try:
+                    data = json.load(file)
+                except ValueError as err:
+                    print(err)
+                    return main_page()
+        else:
+            print("Error: file not exist")
+            main_page()
     else:
-        typer.echo(f"Bye {name}!")
+        print("Unknown command")
+        return main_page()
+    result = API.put(query, data)
+    typer.echo(result)
+    main_page()
+
+
+def post():
+    print("Please enter your query:")
+    query = input()
+    load_dotenv()
+    file_root = os.getenv('FILE_ROOT')
+    print("Please enter the name of file which is in the file root directory:")
+    file_path = r"" + file_root + input()
+    if os.path.isfile(file_path):
+        with open(file_path, "r") as file:
+            try:
+                data = json.load(file)
+            except ValueError as err:
+                print(err)
+                return main_page()
+    else:
+        print("Error: file not exist")
+        main_page()
+    result = API.post(query, data)
+    typer.echo(result)
+    main_page()
+
+
+def delete():
+    print("Please enter your query:")
+    query = input()
+    result = API.delete(query)
+    typer.echo(result)
+    main_page()
+
+
+def export():
+    print("Which collection do you want to export? books or authors?")
+    collection = input()
+    if collection == "books":
+        json = API.get("book")
+        load_dotenv()
+        file_root = os.getenv('FILE_ROOT')
+        with open(file_root + "books" + ".json", "w") as output:
+            output.write(json)
+        print("books.json is stored at " + file_root)
+        main_page()
+    elif collection == "authors":
+        json = API.get("author")
+        load_dotenv()
+        file_root = os.getenv('FILE_ROOT')
+        with open(file_root + "authors" + ".json", "w") as output:
+            output.write(json)
+        print("authors.json is stored at " + file_root)
+        main_page()
+    elif collection == "back":
+        main_page()
+    else:
+        print("Unknown collection")
+        main_page()
+
+
+# def goodbye(name: str, formal: bool = False):
+#     """ function2 """
+#     if formal:
+#         typer.echo(f"Goodbye Ms. {name}. Have a good day.")
+#     else:
+#         typer.echo(f"Bye {name}!")
 
 
 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
diff --git a/Client/__init__.py b/Client/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Crawler/Main.py b/Crawler/Main.py
index 24a7e96..c778b1c 100644
--- a/Crawler/Main.py
+++ b/Crawler/Main.py
@@ -1,18 +1,50 @@
 import Crawler.Spider as Spider
 import time
+import RegularExpressionParser.Parser as Parser
 from DataBase import mongoDB as db
 
 
+def scrape_api(url_api, book_num_api, author_num_api):
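+    # crawl the start page, then follow similar-book links until the requested
+    # number of books or authors is reached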
+    print('Request received, start scraping')
+    book_dict_api = {}
+    author_dict_api = {}
+    similar_book_dict_api = {}
+    count_api = 1
+
+    print("Crawling book #" + str(count_api))
+    start_page_api = Spider.Spider(url_api)
+    start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api)
+
+    while len(book_dict_api) < int(book_num_api) and len(author_dict_api) < int(author_num_api):
+        all_related_books_api = list(similar_book_dict_api.items())
+        for book_url_pair_api in all_related_books_api:
+            if len(book_dict_api) >= int(book_num_api) or len(author_dict_api) >= int(author_num_api):
+                break
+            book_name_api = book_url_pair_api[0]
+            if book_name_api in book_dict_api:
+                pass
+            else:
+                count_api += 1
+                print("Crawling book #" + str(count_api))
+                book_url_api = book_url_pair_api[1]
+                book_spider_api = Spider.Spider(book_url_api)
+                book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api)
+
+                # since the network delay from Asia to goodreads.com is already high,
+                # only a short sleep is used here
+                time.sleep(0.2)
+
+    db.insert_dicts(book_dict_api, 0)
+    db.insert_dicts(author_dict_api, 1)
+    print("Scraping completed")
+
+
 print('please enter URL of the book')
 url = input()
-# url = https://www.goodreads.com/book/show/3735293-clean-code
-# https://www.goodreads.com/book/show/35133922-educated
 print('please enter the number of books you want to crawl')
 book_num = input()
-# bookNum = 3
 print('please enter the number of authors you want to crawl')
 author_num = input()
-# authorNum = 2
 print('Request received, start scraping')
 
 book_dict = {}
@@ -27,17 +59,17 @@ start_page.crawl(book_dict, author_dict, similar_book_dict)
 
 while len(book_dict) < int(book_num) or len(author_dict) < int(author_num):
     all_related_books = list(similar_book_dict.items())
-    for bookURLPair in all_related_books:
+    for book_url_pair in all_related_books:
         if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num):
             break
-        book_name = bookURLPair[0]
+        book_name = book_url_pair[0]
         if book_name in book_dict:
             pass
         else:
             count += 1
             print("Crawling book #" + str(count))
-            bookURL = bookURLPair[1]
-            book_spider = Spider.Spider(bookURL)
+            book_url = book_url_pair[1]
+            book_spider = Spider.Spider(book_url)
             book_spider.crawl(book_dict, author_dict, similar_book_dict)
 
             # since the network delay from asia to goodread.com is already very high,
diff --git a/Crawler/__init__.py b/Crawler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/DataBase/__init__.py b/DataBase/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py
index c4e2222..4095fa3 100644
--- a/DataBase/mongoDB.py
+++ b/DataBase/mongoDB.py
@@ -13,6 +13,18 @@ def get_db():
     return client.get_database("crawler_db")
 
 
+def insert_document(docu, opt):
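+    # insert a single document into the books (opt == 0) or authors (opt == 1) collection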
+    db = get_db()
+    if opt == 0:
+        records = db.books
+    elif opt == 1:
+        records = db.authors
+    else:
+        print("failed to get json file: wrong opt for selecting collection")
+        return
+    records.insert_one(docu)
+
+
 def insert_dicts(dictionary, opt):
     """
     Insert books or authors collection in database
@@ -49,9 +61,13 @@ def update_dicts(opt, identifier, content):
         records = db.books
     elif opt == 1:
         records = db.authors
-    result = records.update_many(
+    else:
+        print("failed to get json file: wrong opt for selecting collection")
+        return
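+    # with upsert=True the document is created when no existing one matches the identifier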
+    result = records.update_one(
         identifier,
         {"$set": content},
+        upsert=True
     )
     print("matched documentation: " + str(result.matched_count))
     print("modified documentation: " + str(result.modified_count))
@@ -96,8 +112,8 @@ def download_collection(opt, identifier, name):
     """
     json_file = get_documents_json(opt, identifier)
     load_dotenv()
-    root = os.getenv('ROOT')
-    with open(root + name + ".json", "w") as output:
+    file_root = os.getenv('FILE_ROOT')
+    with open(file_root + name + ".json", "w") as output:
         output.write(json_file)
 
 
diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py
index 27f4fe6..fc6f628 100644
--- a/RegularExpressionParser/Parser.py
+++ b/RegularExpressionParser/Parser.py
@@ -49,29 +49,35 @@ def parse_query_to_url(query):
 
 
 def parse_query_to_json(pair):
-    elements = re.findall("[0-9A-Za-z\"._]", pair)
+    print(pair)
+    elements = re.findall("[0-9A-Za-z\"_.]+", pair)
     count = len(elements)
-    print(count)
-    if count != 3 and count != 4:
+    if count != 2:
         print("Failed to parse query: invalid args number")
         return {"wrong": "True"}            # will never be founded in database
-    elif count == 3:
-        # can be A.B: C or A.B: "C"
-        if re.search("\".*\"", elements[2]):
-            return {elements[0] + "." + elements[1]: elements[2]}
-        else:
-            return {elements[0] + "." + elements[1]: {"$regex": elements[2], "$options": "i"}}
     else:
-        # can be NOT, >, or <
-        if elements[2] == "NOT":
-            return {elements[0] + "." + elements[1]: {"$not": {"$regex": elements[3], "$options": "i"}}}
-        elif elements[2] == ">":
-            return {elements[0] + "." + elements[1]: {"$gt": float(elements[3])}}
-        elif elements[2] == "<":
-            return {elements[0] + "." + elements[1]: {"$lt": float(elements[3])}}
+        # can be A.B: C or A.B: "C"
+        if re.search(":", pair):
+            if re.search("^[0-9.]*$", elements[1]):
+                return {elements[0].split(".")[1]: float(elements[1])}
+            else:
+                if re.search("\".*\"", elements[1]):
+                    return {elements[0].split(".")[1]: elements[1]}
+                else:
+                    return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}}
         else:
-            print("Failed to parse query: unknown operator")
-            return {"wrong": "True"}
+            if re.search("NOT", pair):
+                if re.search("^[0-9.]*$", elements[1]):
+                    return {elements[0].split(".")[1]: {"$ne": float(elements[1])}}
+                else:
+                    return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}}
+            elif re.search(">", pair):
+                return {elements[0].split(".")[1]: {"$gt": float(elements[1])}}
+            elif re.search("<", pair):
+                return {elements[0].split(".")[1]: {"$lt": float(elements[1])}}
+            else:
+                print("Failed to parse query: unknown operator")
+                return {"wrong": "True"}
 
 
 def url_safe(element):
diff --git a/RegularExpressionParser/__init__.py b/RegularExpressionParser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py
index c416ace..ae10ffb 100644
--- a/Server/SimpleServer.py
+++ b/Server/SimpleServer.py
@@ -1,12 +1,12 @@
-import RegularExpressionParser.Parser as parser
-import DataBase.mongoDB as db
+import DataBase.mongoDB as DataBase
+import RegularExpressionParser.Parser as Parser
 import re
 from flask import Flask
 from flask import request
 from urllib.parse import urlparse, parse_qs
 
 app = Flask(__name__)
-app.config["DEBUG"] = True
+# app.config["DEBUG"] = True
 
 
 @app.route("/", methods=['GET'])
@@ -15,32 +15,79 @@ def home():
 
 
 @app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"])
-def data(collection):
+def data_base(collection):
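+    # REST dispatcher: GET searches, PUT updates, POST inserts and DELETE removes documents of <collection>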
     if request.method == "GET":
-        if collection == "books" or collection == "book":
-            identifier = request.args.to_dict()
-            print(identifier)
-            print(type(identifier))
-            return "200: find"
-        elif collection == "authors" or collection == "author":
-            print(request.url)
-            return "200: find"
+        if collection == "book":
+            url_parsed = urlparse(request.url)
+            qs_parsed = parse_qs(url_parsed.query)
+            # print(qs_parsed["id"])
+            if qs_parsed == {}:
+                return DataBase.get_documents_json(0, {})
+            return search_document(["book.id:" + qs_parsed["id"][0]])
+        elif collection == "author":
+            url_parsed = urlparse(request.url)
+            qs_parsed = parse_qs(url_parsed.query)
+            # print(type(qs_parsed["id"]))
+            if qs_parsed == {}:
+                return DataBase.get_documents_json(1, {})
+            return search_document(["author.id:" + qs_parsed["id"][0]])
         elif collection == "search":
             url_parsed = urlparse(request.url)
             qs_parsed = parse_qs(url_parsed.query)
+            print(qs_parsed)
             return search_document(qs_parsed["q"][0].split("&"))
         else:
-            return "404 not found"
-    # elif request.method == "PUT":
+            return "404: Unknown Collection to GET"
+    elif request.method == "PUT":
+        if request.headers["Content-Type"] != "application/json":
+            return "415: content should be JSON file"
+        json_update_info = request.json
+        if collection == "book":
+            opt = 0
+        elif collection == "author":
+            opt = 1
+        else:
+            return "404: Unknown Collection to GET"
+        DataBase.update_dicts(opt, request.args.to_dict(), json_update_info)
+        return "200: PUT succeeded"
+    elif request.method == "POST":
+        if request.headers["Content-Type"] != "application/json":
+            return "415: content should be JSON file"
+        json_file = request.json
+        if collection == "books":
+            DataBase.insert_dicts(json_file, 0)
+        elif collection == "authors":
+            DataBase.insert_dicts(json_file, 1)
+        elif collection == "book":
+            DataBase.insert_document(json_file, 0)
+        elif collection == "author":
+            DataBase.insert_document(json_file, 1)
+        elif collection == "scrape":
+            return "200"
+        else:
+            return "404: Unknown Collection to POST"
+        return "201: POST succeeded"
+    elif request.method == "DELETE":
+        identifier = request.args.to_dict()
+        print(identifier)
+        if collection == "book":
+            opt = 0
+        elif collection == "author":
+            opt = 1
+        else:
+            return "404: Unknown Collection to DELETE"
+        DataBase.clean(opt, identifier)
+        return "200: DELETE succeeded"
 
 
 def search_document(identifiers):
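+    # identifiers holds either one query string or [query1, "AND"/"OR", query2]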
     if len(identifiers) == 1:
-        json_idt = parser.parse_query_to_json(identifiers[0])
+        json_idt = Parser.parse_query_to_json(identifiers[0])
+        print(json_idt)
         if re.search("^book.*", identifiers[0]):
-            return db.get_documents_json(0, json_idt)
+            return DataBase.get_documents_json(0, json_idt)
         else:
-            return db.get_documents_json(1, json_idt)
+            return DataBase.get_documents_json(1, json_idt)
     elif len(identifiers) == 3:
         if re.search("^book.*", identifiers[0]):
             if re.search("^author.*", identifiers[2]):
@@ -54,16 +101,18 @@ def search_document(identifiers):
                 return {}
             else:
                 opt = 1
-        json_idt1 = parser.parse_query_to_json(identifiers[0])
-        json_idt2 = parser.parse_query_to_json(identifiers[2])
+        json_idt1 = Parser.parse_query_to_json(identifiers[0])
+        json_idt2 = Parser.parse_query_to_json(identifiers[2])
         if identifiers[1] == "AND":
             exp = {"$and": [json_idt1, json_idt2]}
         elif identifiers[1] == "OR":
-            exp = {"$or": [json_idt1,json_idt2]}
+            exp = {"$or": [json_idt1, json_idt2]}
         else:
             print("Failed to parse query: unknown operator for identifiers[1]")
             return {}
-        return db.get_documents_json(opt, exp)
+        print("exp:")
+        print(exp)
+        return DataBase.get_documents_json(opt, exp)
     else:
         return "Error, unknown identifiers"
 
diff --git a/Server/__init__.py b/Server/__init__.py
new file mode 100644
index 0000000..e69de29
-- 
GitLab