From e51da8b907a087f9e7a22c30523156d424625b76 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 2 Mar 2021 07:50:31 +0800 Subject: [PATCH 1/9] assignment2.0 ver1.0: basic classes added --- Crawler/AuthorPack.py | 13 +++++++++++++ Crawler/BookPack.py | 15 +++++++++++++++ Crawler/Main.py | 12 ++++++++++++ Crawler/Spider.py | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+) create mode 100644 Crawler/AuthorPack.py create mode 100644 Crawler/BookPack.py create mode 100644 Crawler/Main.py create mode 100644 Crawler/Spider.py diff --git a/Crawler/AuthorPack.py b/Crawler/AuthorPack.py new file mode 100644 index 0000000..896b8e8 --- /dev/null +++ b/Crawler/AuthorPack.py @@ -0,0 +1,13 @@ + +class AuthorPackage: + def __init__(self, name, url, id, rating, rating_count, + review_count, image_url, related_authors, author_books): + self.name = name + self.url = url + self.id = id + self.rating = rating + self.rating_count = rating_count + self.review_count = review_count + self.image_url = image_url + self.related_authors = related_authors + self.author_books = author_books diff --git a/Crawler/BookPack.py b/Crawler/BookPack.py new file mode 100644 index 0000000..17b404e --- /dev/null +++ b/Crawler/BookPack.py @@ -0,0 +1,15 @@ + +class BookPackage: + def __init__(self, url, title, id, ISBN, author_url, author, rating, + rating_count, review_count, image_url, similar_books): + self.url = url + self.title = title + self.id = id + self.ISBN = ISBN + self.author_url = author_url + self.author = author + self.rating = rating + self.rating_count = rating_count + self.review_count = review_count + self.image_url = image_url + self.similar_books = similar_books \ No newline at end of file diff --git a/Crawler/Main.py b/Crawler/Main.py new file mode 100644 index 0000000..c839815 --- /dev/null +++ b/Crawler/Main.py @@ -0,0 +1,12 @@ +import Crawler.Spider as Spider +import Crawler.BookPack as BookPack +import Crawler.AuthorPack as AuthorPack + + +print('please enter URL of the book') +# url = input() +url = 'https://www.goodreads.com/book/show/3735293-clean-code' +print('URL received, start scraping') + +startPage = Spider.Spider(url) +startPage.scrap() diff --git a/Crawler/Spider.py b/Crawler/Spider.py new file mode 100644 index 0000000..0b46824 --- /dev/null +++ b/Crawler/Spider.py @@ -0,0 +1,33 @@ +import requests +from bs4 import BeautifulSoup + + +class Spider: + + def __init__(self, url): + self.url = url + self.soup = None + + def scrap(self): + header = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"} + + url = self.url + res = requests.get(url, headers=header) + self.soup = BeautifulSoup(res.text, 'lxml') + + seeMoreLink = self.soup.select('body > div.content > div.mainContentContainer > div.mainContent > ' + 'div.mainContentFloat > div.rightContainer > div[id^=relatedWorks] > div > ' + 'div.bigBoxBody > div > a')[0].get("href") + print(seeMoreLink) + + + +# print(soup) +# print(soup.title.string) +# print(soup.find_all('a')) +# aTypes = soup.find_all('a')[0] +# print(aTypes) +# for obj in aTypes: +# if obj. +# print(soup.) 
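The scrap() method added above finds the "Readers also enjoyed" link through a long positional CSS selector, which is fragile against any change in the Goodreads page layout; patch 6 in this series replaces it with a class-based lookup. The following is a minimal standalone sketch of that class-based approach, for illustration only, assuming the page still exposes the "actionLink right seeMoreLink" anchor that patch 6 relies on and that requests, beautifulsoup4 and lxml are installed:

    import requests
    from bs4 import BeautifulSoup

    # spoofed browser user-agent, same idea as the header used in scrap()
    HEADER = {"user-agent": "Mozilla/5.0"}

    def find_see_more_link(book_url):
        """Return the 'Readers also enjoyed' link of a Goodreads book page, or None."""
        soup = BeautifulSoup(requests.get(book_url, headers=HEADER).text, "lxml")
        # look the anchor up by its class instead of by its position in the DOM
        link = soup.find("a", {"class": "actionLink right seeMoreLink"})
        return link["href"] if link else None
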
-- GitLab From 186c3acb1bee3d63b43b6021fe700ac7b7e858ed Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 2 Mar 2021 11:51:59 +0800 Subject: [PATCH 2/9] assignment2.0 ver2.0: crawler completed, database package added and in progress --- DataBase/mongoDB.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 DataBase/mongoDB.py diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py new file mode 100644 index 0000000..e69de29 -- GitLab From c2b4b0397f8f97e70cfd191b4d60b427f6137e6a Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 2 Mar 2021 12:59:13 +0800 Subject: [PATCH 3/9] assignment2.0 ver2.1: dataBase uploading ready --- DataBase/JsonParser.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 DataBase/JsonParser.py diff --git a/DataBase/JsonParser.py b/DataBase/JsonParser.py new file mode 100644 index 0000000..e69de29 -- GitLab From b2347adc0582f590bda3015bec84e154d623b0d8 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Mon, 8 Mar 2021 01:28:49 +0800 Subject: [PATCH 4/9] assignment2.1 initial commit --- CommandLineInterface/CLI.py | 0 RegularExpressionParser/Parser.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 CommandLineInterface/CLI.py create mode 100644 RegularExpressionParser/Parser.py diff --git a/CommandLineInterface/CLI.py b/CommandLineInterface/CLI.py new file mode 100644 index 0000000..e69de29 diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py new file mode 100644 index 0000000..e69de29 -- GitLab From 77982c85941ccb03f81e824b575016418e55aba0 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Mon, 8 Mar 2021 15:21:29 +0800 Subject: [PATCH 5/9] assignment-2.1 version2: query parser finished, GET API finished --- .../CLI.py => API/Functions.py | 0 Client/CLI.py | 21 +++++++++++++++++++ Server/SimpleServer.py | 0 3 files changed, 21 insertions(+) rename CommandLineInterface/CLI.py => API/Functions.py (100%) create mode 100644 Client/CLI.py create mode 100644 Server/SimpleServer.py diff --git a/CommandLineInterface/CLI.py b/API/Functions.py similarity index 100% rename from CommandLineInterface/CLI.py rename to API/Functions.py diff --git a/Client/CLI.py b/Client/CLI.py new file mode 100644 index 0000000..b3c40d6 --- /dev/null +++ b/Client/CLI.py @@ -0,0 +1,21 @@ +import typer + +app = typer.Typer() + +@app.command() +def scrape_book(url: str): + """ Function used to create scrape request to Server """ + + + +@app.command() +def goodbye(name: str, formal: bool = False): + """ function2 """ + if formal: + typer.echo(f"Goodbye Ms. {name}. 
Have a good day.") + else: + typer.echo(f"Bye {name}!") + + +if __name__ == "__main__": + app() \ No newline at end of file diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py new file mode 100644 index 0000000..e69de29 -- GitLab From 8959e6bee5baa5aa08a643c4bb4409ca3afef3e1 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Mon, 8 Mar 2021 15:48:05 +0800 Subject: [PATCH 6/9] assignment-2.1 version2.1: GET API fixed --- API/Functions.py | 21 ++++++ Crawler/BookPack.py | 6 +- Crawler/Main.py | 51 +++++++++++-- Crawler/Spider.py | 86 +++++++++++++++++---- DataBase/JsonParser.py | 53 +++++++++++++ DataBase/mongoDB.py | 120 ++++++++++++++++++++++++++++++ RegularExpressionParser/Parser.py | 78 +++++++++++++++++++ Server/SimpleServer.py | 72 ++++++++++++++++++ 8 files changed, 462 insertions(+), 25 deletions(-) diff --git a/API/Functions.py b/API/Functions.py index e69de29..96e4881 100644 --- a/API/Functions.py +++ b/API/Functions.py @@ -0,0 +1,21 @@ +import requests +import os +import RegularExpressionParser.Parser as parser +import re +from dotenv import load_dotenv + + +def get(query): + """ + function used to send get request to find one or several documentations + :param query: user input query + :return: json file of the given author or book + """ + load_dotenv() + host = os.getenv('SERVER_HOST') + url = host + parser.url_safe(query) + req = requests.get(url) + return req.json() + + + diff --git a/Crawler/BookPack.py b/Crawler/BookPack.py index 17b404e..42d4162 100644 --- a/Crawler/BookPack.py +++ b/Crawler/BookPack.py @@ -1,15 +1,15 @@ class BookPackage: - def __init__(self, url, title, id, ISBN, author_url, author, rating, + def __init__(self, url, title, id, ISBN13, author_url, author, rating, rating_count, review_count, image_url, similar_books): self.url = url self.title = title self.id = id - self.ISBN = ISBN + self.ISBN13 = ISBN13 self.author_url = author_url self.author = author self.rating = rating self.rating_count = rating_count self.review_count = review_count self.image_url = image_url - self.similar_books = similar_books \ No newline at end of file + self.similar_books = similar_books diff --git a/Crawler/Main.py b/Crawler/Main.py index c839815..24a7e96 100644 --- a/Crawler/Main.py +++ b/Crawler/Main.py @@ -1,12 +1,49 @@ import Crawler.Spider as Spider -import Crawler.BookPack as BookPack -import Crawler.AuthorPack as AuthorPack +import time +from DataBase import mongoDB as db print('please enter URL of the book') -# url = input() -url = 'https://www.goodreads.com/book/show/3735293-clean-code' -print('URL received, start scraping') +url = input() +# url = https://www.goodreads.com/book/show/3735293-clean-code +# https://www.goodreads.com/book/show/35133922-educated +print('please enter the number of books you want to crawl') +book_num = input() +# bookNum = 3 +print('please enter the number of authors you want to crawl') +author_num = input() +# authorNum = 2 +print('Request received, start scraping') -startPage = Spider.Spider(url) -startPage.scrap() +book_dict = {} +author_dict = {} +similar_book_dict = {} +count = 1 + +print("Crawling book #" + str(count)) +start_page = Spider.Spider(url) +start_page.crawl(book_dict, author_dict, similar_book_dict) + + +while len(book_dict) < int(book_num) or len(author_dict) < int(author_num): + all_related_books = list(similar_book_dict.items()) + for bookURLPair in all_related_books: + if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num): + break + book_name = bookURLPair[0] + if 
book_name in book_dict: + pass + else: + count += 1 + print("Crawling book #" + str(count)) + bookURL = bookURLPair[1] + book_spider = Spider.Spider(bookURL) + book_spider.crawl(book_dict, author_dict, similar_book_dict) + + # since the network delay from asia to goodread.com is already very high, + # a small amount of time of sleep is used here. + time.sleep(0.2) + +db.insert_dicts(book_dict, 0) +db.insert_dicts(author_dict, 1) +print("Scraping completed") diff --git a/Crawler/Spider.py b/Crawler/Spider.py index 0b46824..a689789 100644 --- a/Crawler/Spider.py +++ b/Crawler/Spider.py @@ -1,5 +1,7 @@ import requests from bs4 import BeautifulSoup +import Crawler.BookPack as BookPack +import Crawler.AuthorPack as AuthorPack class Spider: @@ -8,26 +10,80 @@ class Spider: self.url = url self.soup = None - def scrap(self): - header = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + def crawl(self, book_dict, author_dict, similar_book_dict): + header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"} url = self.url res = requests.get(url, headers=header) - self.soup = BeautifulSoup(res.text, 'lxml') + self.soup = BeautifulSoup(res.text, "lxml") + titleAndAuthor = self.soup.title.string.split(" by ") + title = titleAndAuthor[0] + bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0] + rating_raw = self.soup.find("span", {"itemprop": "ratingValue"}).text + rating = float(str(rating_raw).replace(" ", "").replace("\n", "")) + rating_count = int(self.soup.find("meta", itemprop="ratingCount")["content"]) + review_count = int(self.soup.find("meta", itemprop="reviewCount")["content"]) + ISBN13_raw = self.soup.find("meta", property="books:isbn") + if ISBN13_raw["content"] == "null": + print("Warning: book '" + title + "' has no ISBN code") + ISBN13 = 0 + else: + ISBN13 = int(ISBN13_raw["content"]) + imageURL = self.soup.find("img", id="coverImage")["src"] + author = titleAndAuthor[1] + author_page_link = self.soup.find("a", {"class": "authorName"})["href"] + authorID = (author_page_link.split("show/")[1]).split(".")[0] + see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"] + res_similar = requests.get(see_more_link, headers=header) + similar_soup = BeautifulSoup(res_similar.text, 'lxml') + similar_books_and_authors = similar_soup.find_all("span", itemprop="name") + similar_books = [] + for i in range(len(similar_books_and_authors)): + if i % 2 == 0: + # a book + similar_book = similar_books_and_authors[i] + similar_book_name = similar_book.text + book_link = "https://www.goodreads.com" + similar_book.find_parent("a")["href"] + similar_books.append(similar_book_name) + similar_book_dict[similar_book_name] = book_link - seeMoreLink = self.soup.select('body > div.content > div.mainContentContainer > div.mainContent > ' - 'div.mainContentFloat > div.rightContainer > div[id^=relatedWorks] > div > ' - 'div.bigBoxBody > div > a')[0].get("href") - print(seeMoreLink) + bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count, + review_count, imageURL, similar_books) + book_dict[title] = bookPKT + res_author = requests.get(author_page_link, headers=header) + author_soup = BeautifulSoup(res_author.text, 'lxml') + author_rating = float(author_soup.find("span", {"class": "average"}).text) + author_rating_count = float(author_soup.find("span", {"class": "value-title"})["title"]) + author_review_count = 
float((author_soup.find_all("span", {"class": "value-title"})[1])["title"]) + author_photo_url = author_soup.find("img", {"itemprop": "image"})["src"] + related_links = author_soup.find("div", {"class": "hreview-aggregate"}).findChildren("a", recursive=False) + related_authors_link = "https://www.goodreads.com" + related_links[1]["href"] + res_related_authors = requests.get(related_authors_link, headers=header) + related_author_soup = BeautifulSoup(res_related_authors.text, 'lxml') + related_authors = [] + related_authors_raw = related_author_soup.find_all("span", {"itemprop": "name"}) + for the_author in related_authors_raw: + related_author_name = the_author.text + if related_author_name == author: + pass + else: + related_authors.append(related_author_name) + author_books_link = "https://www.goodreads.com" + related_links[0]["href"] + res_author_books = requests.get(author_books_link, headers=header) + author_books_soup = BeautifulSoup(res_author_books.text, "lxml") + author_books_raw = author_books_soup.find_all("a", {"class": "bookTitle"}) + author_books = [] + for book in author_books_raw: + book_name = book.findChildren("span", recursive=False) + if not book_name: + pass + else: + book_name = book_name[0].text + author_books.append(book_name) + author_pkt = AuthorPack.AuthorPackage(author, author_page_link, authorID, author_rating, author_rating_count, + author_review_count, author_photo_url, related_authors, author_books) + author_dict[author] = author_pkt -# print(soup) -# print(soup.title.string) -# print(soup.find_all('a')) -# aTypes = soup.find_all('a')[0] -# print(aTypes) -# for obj in aTypes: -# if obj. -# print(soup.) diff --git a/DataBase/JsonParser.py b/DataBase/JsonParser.py index e69de29..2a5e524 100644 --- a/DataBase/JsonParser.py +++ b/DataBase/JsonParser.py @@ -0,0 +1,53 @@ + +def parse_book_dict_to_json(dictionary): + item_list = list(dictionary.items()) + return_list = [] + for items in item_list: + book_package = items[1] + book_json = parse_book_to_json(book_package) + return_list.append(book_json) + return return_list + + +def parse_author_dict_to_json(dictionary): + item_list = list(dictionary.items()) + return_list = [] + for items in item_list: + author_package = items[1] + author_json = parse_author_to_json(author_package) + return_list.append(author_json) + return return_list + + +def parse_book_to_json(bookPKT): + book_json = { + "type": "book", + "book_url": bookPKT.url, + "title": bookPKT.title, + "id": bookPKT.id, + "ISBN": bookPKT.ISBN13, + "author_url": bookPKT.author_url, + "author": bookPKT.author, + "rating": bookPKT.rating, + "rating_count": bookPKT.rating_count, + "review_count": bookPKT.review_count, + "image_url": bookPKT.image_url, + "similar_books": bookPKT.similar_books + } + return book_json + + +def parse_author_to_json(authorPKT): + author_json = { + "type": "author", + "name": authorPKT.name, + "author_url": authorPKT.url, + "id": authorPKT.id, + "rating": authorPKT.rating, + "rating_count": authorPKT.rating_count, + "review_count": authorPKT.review_count, + "image_url": authorPKT.image_url, + "related_authors": authorPKT.related_authors, + "author_books": authorPKT.author_books + } + return author_json diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py index e69de29..c4e2222 100644 --- a/DataBase/mongoDB.py +++ b/DataBase/mongoDB.py @@ -0,0 +1,120 @@ +import json +import DataBase.JsonParser as parser +import os +from dotenv import load_dotenv +from pymongo import MongoClient + + +def get_db(): + """ return the database of goodReads 
crawler """ + load_dotenv() + url = os.getenv('MONGODB_URL') + client = MongoClient(url) + return client.get_database("crawler_db") + + +def insert_dicts(dictionary, opt): + """ + Insert books or authors collection in database + :param dictionary: the dictionary to insert to collection + :param opt: =0 means books collection; =1 means authors collection + :return: no return value + """ + db = get_db() + if opt == 0: + records = db.books + elif opt == 1: + records = db.authors + else: + print("failed to get json file: wrong opt for selecting collection") + return + json_list = [] + if opt == 0: + json_list = parser.parse_book_dict_to_json(dictionary) + elif opt == 1: + json_list = parser.parse_author_dict_to_json(dictionary) + records.insert_many(json_list) + + +def update_dicts(opt, identifier, content): + """ + Update documentations in a given collection + :param opt: =0 means books collection; =1 means authors collection + :param identifier: the identifier of the documentation we want to find + :param content: the content to update + :return: no return value + """ + db = get_db() + if opt == 0: + records = db.books + elif opt == 1: + records = db.authors + result = records.update_many( + identifier, + {"$set": content}, + ) + print("matched documentation: " + str(result.matched_count)) + print("modified documentation: " + str(result.modified_count)) + + +def get_documents_json(opt, identifier): + """ + find documentations specified by the identifier and output a json data + :param opt: =0 means books collection; =1 means authors collection + :param identifier: identifier of the documents we want, {} means locate the whole collection + :return: json file of selected documentations + """ + db = get_db() + if opt == 0: + records = db.books + elif opt == 1: + records = db.authors + else: + print("failed to get json file: wrong opt for selecting collection") + return json.dumps({}) + data = records.find(identifier) + file = {} + if opt == 0: + typeName = "books" + else: + typeName = "authors" + file[typeName] = [] + for item in data: + item.pop("_id") + file[typeName].append(item) + return json.dumps(file) + + +def download_collection(opt, identifier, name): + """ + download books collection or authors collection + :param opt: =0 means books collection; =1 means authors collection + :param identifier: identifier of the documents we want to download; + empty({}) means selected all documents in given collection + :param name: file name of downloaded json + :return: JSON file of the collection + """ + json_file = get_documents_json(opt, identifier) + load_dotenv() + root = os.getenv('ROOT') + with open(root + name + ".json", "w") as output: + output.write(json_file) + + +def clean(opt, identifier): + """ + delete specific documents in given collection + :param opt: =0 means books collection; =1 means authors collection + :param identifier: identifier of the documents we want to delete; + empty({}) means selected all documents in given collection + :return: no return value + """ + db = get_db() + if opt == 0: + records = db.books + elif opt == 1: + records = db.authors + else: + print("failed to get json file: wrong opt for selecting collection") + return + records.delete_many(identifier) diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py index e69de29..27f4fe6 100644 --- a/RegularExpressionParser/Parser.py +++ b/RegularExpressionParser/Parser.py @@ -0,0 +1,78 @@ +import re + + +def check_if_address_valid(address): + """ + Take input address and scrape date + 
:param addr: + :return: + """ + x = re.search("^https://www.goodreads.com/book/show/[0-9]+", address) + if x: + return True + else: + return False + + +def parse_query_to_url(query): + elements = re.findall("[0-9A-Za-z_\"><]+", query) + count = len(elements) + if count == 3: + # can only be A.B:C or wrong + if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:[0-9a-zA-Z_\"]", query): + return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2]) + else: + print("Invalid query.") + return "" + elif count == 4: + # a pair and one of [NOT, >, <]. + if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\sNOT\s[0-9a-zA-Z_\"]", query): + return url_safe(elements[0] + "." + elements[1] + "%3A" + "NOT" + elements[2]) + elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s>\s[0-9a-zA-Z_\"]", query): + return url_safe(elements[0] + "." + elements[1] + "%3A" + ">" + elements[2]) + elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s<\s[0-9a-zA-Z_\"]", query): + return url_safe(elements[0] + "." + elements[1] + "%3A" + "<" + elements[2]) + else: + print("Invalid query.") + return "" + elif 6 <= count <= 8: + # AND or OR operator + if re.search(".*\sAND\s.*", query): + parts = query.split(" AND ") + return parse_query_to_url(parts[0]) + "%26AND%26" + parse_query_to_url(parts[1]) + elif re.search(".*\sOR\s.*", query): + parts = query.split(" OR ") + return parse_query_to_url(parts[0]) + "%26OR%26" + parse_query_to_url(parts[1]) + else: + print("Invalid query.") + return "" + + +def parse_query_to_json(pair): + elements = re.findall("[0-9A-Za-z\"._]", pair) + count = len(elements) + print(count) + if count != 3 and count != 4: + print("Failed to parse query: invalid args number") + return {"wrong": "True"} # will never be founded in database + elif count == 3: + # can be A.B: C or A.B: "C" + if re.search("\".*\"", elements[2]): + return {elements[0] + "." + elements[1]: elements[2]} + else: + return {elements[0] + "." + elements[1]: {"$regex": elements[2], "$options": "i"}} + else: + # can be NOT, >, or < + if elements[2] == "NOT": + return {elements[0] + "." + elements[1]: {"$not": {"$regex": elements[3], "$options": "i"}}} + elif elements[2] == ">": + return {elements[0] + "." + elements[1]: {"$gt": float(elements[3])}} + elif elements[2] == "<": + return {elements[0] + "." 
+ elements[1]: {"$lt": float(elements[3])}} + else: + print("Failed to parse query: unknown operator") + return {"wrong": "True"} + + +def url_safe(element): + return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C") \ No newline at end of file diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py index e69de29..c416ace 100644 --- a/Server/SimpleServer.py +++ b/Server/SimpleServer.py @@ -0,0 +1,72 @@ +import RegularExpressionParser.Parser as parser +import DataBase.mongoDB as db +import re +from flask import Flask +from flask import request +from urllib.parse import urlparse, parse_qs + +app = Flask(__name__) +app.config["DEBUG"] = True + + +@app.route("/", methods=['GET']) +def home(): + return "200: successfully connected to home page\n" + + +@app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"]) +def data(collection): + if request.method == "GET": + if collection == "books" or collection == "book": + identifier = request.args.to_dict() + print(identifier) + print(type(identifier)) + return "200: find" + elif collection == "authors" or collection == "author": + print(request.url) + return "200: find" + elif collection == "search": + url_parsed = urlparse(request.url) + qs_parsed = parse_qs(url_parsed.query) + return search_document(qs_parsed["q"][0].split("&")) + else: + return "404 not found" + # elif request.method == "PUT": + + +def search_document(identifiers): + if len(identifiers) == 1: + json_idt = parser.parse_query_to_json(identifiers[0]) + if re.search("^book.*", identifiers[0]): + return db.get_documents_json(0, json_idt) + else: + return db.get_documents_json(1, json_idt) + elif len(identifiers) == 3: + if re.search("^book.*", identifiers[0]): + if re.search("^author.*", identifiers[2]): + print("Failed to find documentation: two statements are not pointing to the same collection") + return {} + else: + opt = 0 + else: + if re.search("^book.*",identifiers[2]): + print("Failed to find documentation: two statements are not pointing to the same collection") + return {} + else: + opt = 1 + json_idt1 = parser.parse_query_to_json(identifiers[0]) + json_idt2 = parser.parse_query_to_json(identifiers[2]) + if identifiers[1] == "AND": + exp = {"$and": [json_idt1, json_idt2]} + elif identifiers[1] == "OR": + exp = {"$or": [json_idt1,json_idt2]} + else: + print("Failed to parse query: unknown operator for identifiers[1]") + return {} + return db.get_documents_json(opt, exp) + else: + return "Error, unknown identifiers" + + +if __name__ == "__main__": + app.run() -- GitLab From a3565f17c3de72df48b40d29615e6ed5095d7fe9 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 9 Mar 2021 00:53:56 +0800 Subject: [PATCH 7/9] assignment-2.1 version3: API all deployed, CLI initially finished --- API/Functions.py | 38 +++++- API/__init__.py | 0 Client/CLI.py | 183 ++++++++++++++++++++++++++-- Client/__init__.py | 0 Crawler/Main.py | 48 ++++++-- Crawler/__init__.py | 0 DataBase/__init__.py | 0 DataBase/mongoDB.py | 22 +++- RegularExpressionParser/Parser.py | 42 ++++--- RegularExpressionParser/__init__.py | 0 Server/SimpleServer.py | 91 ++++++++++---- Server/__init__.py | 0 12 files changed, 360 insertions(+), 64 deletions(-) create mode 100644 API/__init__.py create mode 100644 Client/__init__.py create mode 100644 Crawler/__init__.py create mode 100644 DataBase/__init__.py create mode 100644 RegularExpressionParser/__init__.py create mode 100644 Server/__init__.py diff --git a/API/Functions.py b/API/Functions.py index 96e4881..452887c 
100644 --- a/API/Functions.py +++ b/API/Functions.py @@ -1,21 +1,49 @@ import requests import os -import RegularExpressionParser.Parser as parser -import re +import RegularExpressionParser.Parser as Parser from dotenv import load_dotenv +def safe_url(query): + load_dotenv() + host = os.getenv('SERVER_HOST') + return host + Parser.url_safe(query) + + def get(query): """ function used to send get request to find one or several documentations :param query: user input query :return: json file of the given author or book """ - load_dotenv() - host = os.getenv('SERVER_HOST') - url = host + parser.url_safe(query) + print("GET: " + query) + url = safe_url(query) req = requests.get(url) + print("response code: " + str(req.status_code)) return req.json() +def put(query, json_file): + print("PUT: " + query) + url = safe_url(query) + req = requests.put(url, json=json_file) + print("response code: " + str(req.status_code)) + return req.text + + +def post(query, json_file): + print("POST: " + query) + url = safe_url(query) + req = requests.put(url, json=json_file) + print("response code: " + str(req.status_code)) + return req.text + + +def delete(query): + print("DELETE: " + query) + url = safe_url(query) + req = requests.delete(url) + print("response code: " + str(req.status_code)) + return req.text + diff --git a/API/__init__.py b/API/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Client/CLI.py b/Client/CLI.py index b3c40d6..8104183 100644 --- a/Client/CLI.py +++ b/Client/CLI.py @@ -1,21 +1,186 @@ import typer +import RegularExpressionParser.Parser as Parser +import Crawler.Main as Main +import API.Functions as API +import os +import json +from dotenv import load_dotenv app = typer.Typer() + @app.command() -def scrape_book(url: str): - """ Function used to create scrape request to Server """ +def main_page(): + print("Welcome to use goodReads scraper!\n" + "This is main page\n" + "Usage:\n" + "scrape: scrape goodRead and load data to database\n" + "get: find a document for a book or an author from database\n" + "put: create or update a document in database\n" + "post: add one or multiple documents of book or author in database \n" + "delete: delete certain document from database\n" + "export: export a certain collection from database\n" + "exit: end the program") + order = input() + if order == "scrape": + scrape() + elif order == "get": + get() + elif order == "put": + put() + elif order == "post": + post() + elif order == "delete": + delete() + elif order == "export": + export() + elif order == "exit": + exit() + else: + print("Unknown command") + main_page() +def scrape(): + """ + Function used to create scrape request to Server + Only for assignment 2.0, for GET scrape, find it in GET in SimplerServer.py + """ + print('please enter URL of the book') + url = input() + print('please enter the number of books you want to crawl') + book_num = input() + if not book_num.isdigit(): + print("max book num must be an integer") + main_page() + book_num = int(book_num) + print('please enter the number of authors you want to crawl') + author_num = input() + if not author_num.isdigit(): + print("max author num must be an integer") + main_page() + author_num = int(author_num) + print('Request received, start scraping') + if not Parser.check_if_address_valid(url): + print("Error: invalid URL") + main_page() + elif book_num >= 2000 or author_num >= 2000: + print("Error, the number of book and author should be lower than 2000") + main_page() + else: + if book_num > 200 or author_num > 50: + 
print("Warning, maximum of the number of books or authors is large") + Main.scrape_api(url, book_num, author_num) + main_page() -@app.command() -def goodbye(name: str, formal: bool = False): - """ function2 """ - if formal: - typer.echo(f"Goodbye Ms. {name}. Have a good day.") + +def get(): + print("Please enter your query:") + query = input() + result = API.get(query) + typer.echo(result) + main_page() + + +def put(): + print("Please enter your query:") + query = input() + print("Please select one way to load update info: input or read:") + load = input() + if load == "input": + print("Please enter json data:") + string = input() + try: + data = json.loads(string) + except ValueError as err: + print(err) + main_page() + elif load == "read": + load_dotenv() + file_root = os.getenv('FILE_ROOT') + print("Please enter the name of file which is in the file root directory:") + file_path = r"" + file_root + input() + if os.path.isfile(file_path): + with open(file_path, "r") as file: + try: + data = json.load(file) + except ValueError as err: + print(err) + main_page() + else: + print("Error: file not exist") + main_page() else: - typer.echo(f"Bye {name}!") + print("Unknown command") + main_page() + result = API.put(query, data) + typer.echo(result) + main_page() + + +def post(): + print("Please enter your query:") + query = input() + load_dotenv() + file_root = os.getenv('FILE_ROOT') + print("Please enter the name of file which is in the file root directory:") + file_path = r"" + file_root + input() + if os.path.isfile(file_path): + with open(file_path, "r") as file: + try: + data = json.load(file) + except ValueError as err: + print(err) + main_page() + else: + print("Error: file not exist") + main_page() + result = API.post(query, data) + typer.echo(result) + main_page() + + +def delete(): + print("Please enter your query:") + query = input() + result = API.delete(query) + typer.echo(result) + main_page() + + +def export(): + print("Which collection do you want to export? books or authors?") + collection = input() + if collection == "books": + json = API.get("book") + load_dotenv() + file_root = os.getenv('FILE_ROOT') + with open(file_root + "books" + ".json", "w") as output: + output.write(json) + print("books.json is stored at " + file_root) + main_page() + elif collection == "authors": + json = API.get("author") + load_dotenv() + file_root = os.getenv('FILE_ROOT') + with open(file_root + "authors" + ".json", "w") as output: + output.write(json) + print("authors.json is stored at " + file_root) + main_page() + elif collection == "back": + main_page() + else: + print("Unknown collection") + main_page() + + +# def goodbye(name: str, formal: bool = False): +# """ function2 """ +# if formal: +# typer.echo(f"Goodbye Ms. {name}. 
Have a good day.") +# else: +# typer.echo(f"Bye {name}!") if __name__ == "__main__": - app() \ No newline at end of file + app() diff --git a/Client/__init__.py b/Client/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Crawler/Main.py b/Crawler/Main.py index 24a7e96..c778b1c 100644 --- a/Crawler/Main.py +++ b/Crawler/Main.py @@ -1,18 +1,50 @@ import Crawler.Spider as Spider import time +import RegularExpressionParser.Parser as Parser from DataBase import mongoDB as db +def scrape_api(url_api, book_num_api, author_num_api): + print('Request received, start scraping') + book_dict_api = {} + author_dict_api = {} + similar_book_dict_api = {} + count_api = 1 + + print("Crawling book #" + str(count_api)) + start_page_api = Spider.Spider(url_api) + start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + while len(book_dict_api) < int(book_num_api) and len(author_dict_api) < int(author_num_api): + all_related_books_api = list(similar_book_dict_api.items()) + for book_url_pair_api in all_related_books_api: + if len(book_dict_api) >= int(book_num_api) or len(author_dict_api) >= int(author_num_api): + break + book_name_api = book_url_pair_api[0] + if book_name_api in book_dict_api: + pass + else: + count_api += 1 + print("Crawling book #" + str(count_api)) + book_url_api = book_url_pair_api[1] + book_spider_api = Spider.Spider(book_url_api) + book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + # since the network delay from asia to goodread.com is already very high, + # a small amount of time of sleep is used here. + time.sleep(0.2) + + db.insert_dicts(book_dict_api, 0) + db.insert_dicts(author_dict_api, 1) + print("Scraping completed") + + print('please enter URL of the book') url = input() -# url = https://www.goodreads.com/book/show/3735293-clean-code -# https://www.goodreads.com/book/show/35133922-educated print('please enter the number of books you want to crawl') book_num = input() -# bookNum = 3 print('please enter the number of authors you want to crawl') author_num = input() -# authorNum = 2 print('Request received, start scraping') book_dict = {} @@ -27,17 +59,17 @@ start_page.crawl(book_dict, author_dict, similar_book_dict) while len(book_dict) < int(book_num) or len(author_dict) < int(author_num): all_related_books = list(similar_book_dict.items()) - for bookURLPair in all_related_books: + for book_url_pair in all_related_books: if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num): break - book_name = bookURLPair[0] + book_name = book_url_pair[0] if book_name in book_dict: pass else: count += 1 print("Crawling book #" + str(count)) - bookURL = bookURLPair[1] - book_spider = Spider.Spider(bookURL) + book_url = book_url_pair[1] + book_spider = Spider.Spider(book_url) book_spider.crawl(book_dict, author_dict, similar_book_dict) # since the network delay from asia to goodread.com is already very high, diff --git a/Crawler/__init__.py b/Crawler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/DataBase/__init__.py b/DataBase/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py index c4e2222..4095fa3 100644 --- a/DataBase/mongoDB.py +++ b/DataBase/mongoDB.py @@ -13,6 +13,18 @@ def get_db(): return client.get_database("crawler_db") +def insert_document(docu, opt): + db = get_db() + if opt == 0: + records = db.books + elif opt == 1: + records = db.authors + else: + print("failed to get json file: wrong opt for selecting 
collection") + return + records.insert_one(docu) + + def insert_dicts(dictionary, opt): """ Insert books or authors collection in database @@ -49,9 +61,13 @@ def update_dicts(opt, identifier, content): records = db.books elif opt == 1: records = db.authors - result = records.update_many( + else: + print("failed to get json file: wrong opt for selecting collection") + return + result = records.update_one( identifier, {"$set": content}, + upsert=True ) print("matched documentation: " + str(result.matched_count)) print("modified documentation: " + str(result.modified_count)) @@ -96,8 +112,8 @@ def download_collection(opt, identifier, name): """ json_file = get_documents_json(opt, identifier) load_dotenv() - root = os.getenv('ROOT') - with open(root + name + ".json", "w") as output: + file_root = os.getenv('FILE_ROOT') + with open(file_root + name + ".json", "w") as output: output.write(json_file) diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py index 27f4fe6..fc6f628 100644 --- a/RegularExpressionParser/Parser.py +++ b/RegularExpressionParser/Parser.py @@ -49,29 +49,35 @@ def parse_query_to_url(query): def parse_query_to_json(pair): - elements = re.findall("[0-9A-Za-z\"._]", pair) + print(pair) + elements = re.findall("[0-9A-Za-z\"_.]+", pair) count = len(elements) - print(count) - if count != 3 and count != 4: + if count != 2: print("Failed to parse query: invalid args number") return {"wrong": "True"} # will never be founded in database - elif count == 3: - # can be A.B: C or A.B: "C" - if re.search("\".*\"", elements[2]): - return {elements[0] + "." + elements[1]: elements[2]} - else: - return {elements[0] + "." + elements[1]: {"$regex": elements[2], "$options": "i"}} else: - # can be NOT, >, or < - if elements[2] == "NOT": - return {elements[0] + "." + elements[1]: {"$not": {"$regex": elements[3], "$options": "i"}}} - elif elements[2] == ">": - return {elements[0] + "." + elements[1]: {"$gt": float(elements[3])}} - elif elements[2] == "<": - return {elements[0] + "." 
+ elements[1]: {"$lt": float(elements[3])}} + # can be A.B: C or A.B: "C" + if re.search(":", pair): + if re.search("^[0-9.]*$", elements[1]): + return {elements[0].split(".")[1]: float(elements[1])} + else: + if re.search("\".*\"", elements[1]): + return {elements[0].split(".")[1]: elements[1]} + else: + return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}} else: - print("Failed to parse query: unknown operator") - return {"wrong": "True"} + if re.search("NOT", pair): + if re.search("^[0-9.]*$", elements[1]): + return {elements[0].split(".")[1]: {"$ne": float(elements[1])}} + else: + return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}} + elif re.search(">", pair): + return {elements[0].split(".")[1]: {"$gt": float(elements[1])}} + elif re.search("<", pair): + return {elements[0].split(".")[1]: {"$lt": float(elements[1])}} + else: + print("Failed to parse query: unknown operator") + return {"wrong": "True"} def url_safe(element): diff --git a/RegularExpressionParser/__init__.py b/RegularExpressionParser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py index c416ace..ae10ffb 100644 --- a/Server/SimpleServer.py +++ b/Server/SimpleServer.py @@ -1,12 +1,12 @@ -import RegularExpressionParser.Parser as parser -import DataBase.mongoDB as db +import DataBase.mongoDB as DataBase +import RegularExpressionParser.Parser as Parser import re from flask import Flask from flask import request from urllib.parse import urlparse, parse_qs app = Flask(__name__) -app.config["DEBUG"] = True +# app.config["DEBUG"] = True @app.route("/", methods=['GET']) @@ -15,32 +15,79 @@ def home(): @app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"]) -def data(collection): +def data_base(collection): if request.method == "GET": - if collection == "books" or collection == "book": - identifier = request.args.to_dict() - print(identifier) - print(type(identifier)) - return "200: find" - elif collection == "authors" or collection == "author": - print(request.url) - return "200: find" + if collection == "book": + url_parsed = urlparse(request.url) + qs_parsed = parse_qs(url_parsed.query) + # print(qs_parsed["id"]) + if qs_parsed == {}: + return DataBase.get_documents_json(0, {}) + return search_document(["book.id:" + qs_parsed["id"][0]]) + elif collection == "author": + url_parsed = urlparse(request.url) + qs_parsed = parse_qs(url_parsed.query) + # print(type(qs_parsed["id"])) + if qs_parsed == {}: + return DataBase.get_documents_json(1, {}) + return search_document(["author.id:" + qs_parsed["id"][0]]) elif collection == "search": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) + print(qs_parsed) return search_document(qs_parsed["q"][0].split("&")) else: - return "404 not found" - # elif request.method == "PUT": + return "404: Unknown Collection to GET" + elif request.method == "PUT": + if request.headers["Content-Type"] != "application/json": + return "415: content should be JSON file" + json_update_info = request.json + if collection == "book": + opt = 0 + elif collection == "author": + opt = 1 + else: + return "404: Unknown Collection to GET" + DataBase.update_dicts(opt, request.args.to_dict(), json_update_info) + return "200: PUT succeeded" + elif request.method == "POST": + if request.headers["Content-Type"] != "application/json": + return "415: content should be JSON file" + json_file = request.json + if collection == "books": + 
DataBase.insert_dicts(json_file, 0) + elif collection == "authors": + DataBase.insert_dicts(json_file, 1) + elif collection == "book": + DataBase.insert_document(json_file, 0) + elif collection == "author": + DataBase.insert_document(json_file, 1) + elif collection == "scrape": + return "200" + else: + return "404: Unknown Collection to POST" + return "201: POST succeeded" + elif request.method == "DELETE": + identifier = request.args.to_dict() + print(identifier) + if collection == "book": + opt = 0 + elif collection == "author": + opt = 1 + else: + return "404: Unknown Collection to DELETE" + DataBase.clean(opt, identifier) + return "200: DELETE succeeded" def search_document(identifiers): if len(identifiers) == 1: - json_idt = parser.parse_query_to_json(identifiers[0]) + json_idt = Parser.parse_query_to_json(identifiers[0]) + print(json_idt) if re.search("^book.*", identifiers[0]): - return db.get_documents_json(0, json_idt) + return DataBase.get_documents_json(0, json_idt) else: - return db.get_documents_json(1, json_idt) + return DataBase.get_documents_json(1, json_idt) elif len(identifiers) == 3: if re.search("^book.*", identifiers[0]): if re.search("^author.*", identifiers[2]): @@ -54,16 +101,18 @@ def search_document(identifiers): return {} else: opt = 1 - json_idt1 = parser.parse_query_to_json(identifiers[0]) - json_idt2 = parser.parse_query_to_json(identifiers[2]) + json_idt1 = Parser.parse_query_to_json(identifiers[0]) + json_idt2 = Parser.parse_query_to_json(identifiers[2]) if identifiers[1] == "AND": exp = {"$and": [json_idt1, json_idt2]} elif identifiers[1] == "OR": - exp = {"$or": [json_idt1,json_idt2]} + exp = {"$or": [json_idt1, json_idt2]} else: print("Failed to parse query: unknown operator for identifiers[1]") return {} - return db.get_documents_json(opt, exp) + print("exp:") + print(exp) + return DataBase.get_documents_json(opt, exp) else: return "Error, unknown identifiers" diff --git a/Server/__init__.py b/Server/__init__.py new file mode 100644 index 0000000..e69de29 -- GitLab From 155ae6b1e1c409b2eb4291fef1a240ca8ad54924 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 9 Mar 2021 05:49:02 +0800 Subject: [PATCH 8/9] assignment2.1 version3: part3 finished, start testing --- API/Functions.py | 10 +- Client/CLI.py | 197 ++++++++++++++++++------------ Crawler/Main.py | 81 ------------ Crawler/Scrape.py | 38 ++++++ Crawler/Spider.py | 13 ++ RegularExpressionParser/Parser.py | 4 +- Server/SimpleServer.py | 19 ++- 7 files changed, 189 insertions(+), 173 deletions(-) delete mode 100644 Crawler/Main.py create mode 100644 Crawler/Scrape.py diff --git a/API/Functions.py b/API/Functions.py index 452887c..4858e26 100644 --- a/API/Functions.py +++ b/API/Functions.py @@ -19,7 +19,7 @@ def get(query): print("GET: " + query) url = safe_url(query) req = requests.get(url) - print("response code: " + str(req.status_code)) + print("\nresponse code: " + str(req.status_code) + "\n") return req.json() @@ -27,15 +27,15 @@ def put(query, json_file): print("PUT: " + query) url = safe_url(query) req = requests.put(url, json=json_file) - print("response code: " + str(req.status_code)) + print("\nresponse code: " + str(req.status_code) + "\n") return req.text def post(query, json_file): print("POST: " + query) url = safe_url(query) - req = requests.put(url, json=json_file) - print("response code: " + str(req.status_code)) + req = requests.post(url, json=json_file) + print("\nresponse code: " + str(req.status_code) + "\n") return req.text @@ -43,7 +43,7 @@ def 
delete(query): print("DELETE: " + query) url = safe_url(query) req = requests.delete(url) - print("response code: " + str(req.status_code)) + print("\nresponse code: " + str(req.status_code) + "\n") return req.text diff --git a/Client/CLI.py b/Client/CLI.py index 8104183..3bb87ef 100644 --- a/Client/CLI.py +++ b/Client/CLI.py @@ -1,9 +1,10 @@ import typer import RegularExpressionParser.Parser as Parser -import Crawler.Main as Main +import Crawler.Scrape as Scrape import API.Functions as API import os import json +import re from dotenv import load_dotenv app = typer.Typer() @@ -11,20 +12,24 @@ app = typer.Typer() @app.command() def main_page(): - print("Welcome to use goodReads scraper!\n" - "This is main page\n" - "Usage:\n" - "scrape: scrape goodRead and load data to database\n" - "get: find a document for a book or an author from database\n" - "put: create or update a document in database\n" - "post: add one or multiple documents of book or author in database \n" - "delete: delete certain document from database\n" - "export: export a certain collection from database\n" - "exit: end the program") + """ + main page of client, directly run this .py to enter and see usage + :return: none + """ + print("==================================================================\n\n" + "Welcome to use goodReads scraper!\n\n" + "This is main page\n\n" + "You can input back to return to this page at anytime\n\n" + "Usage:\n\n" + "get: find a document for a book or an author from database\n\n" + "put: create or update a document in database\n\n" + "post: add one or multiple documents of book or author in database \n\n" + "delete: delete certain document from database\n\n" + "export: export a certain collection from database\n\n" + "exit: end the program\n\n" + "Please enter your command:\n") order = input() - if order == "scrape": - scrape() - elif order == "get": + if order == "get": get() elif order == "put": put() @@ -34,152 +39,186 @@ def main_page(): delete() elif order == "export": export() - elif order == "exit": + if order == "exit": exit() else: print("Unknown command") main_page() -def scrape(): +def check_and_back(command): + """ check if used in put back and return to main page if so """ + if command == "back": + main_page() + + +def back_to_main_page(): + """ wait for user to press enter and then return to the main page """ + print("press enter to return to main page") + no_use = input() + main_page() + + +def scrape(query): """ Function used to create scrape request to Server - Only for assignment 2.0, for GET scrape, find it in GET in SimplerServer.py """ - print('please enter URL of the book') - url = input() - print('please enter the number of books you want to crawl') - book_num = input() + attributes = query.split("?")[1].split("&") + print(attributes) + url = attributes[0].split("=", 1)[1] + book_num = attributes[1].split("=")[1] if not book_num.isdigit(): print("max book num must be an integer") - main_page() + back_to_main_page() book_num = int(book_num) - print('please enter the number of authors you want to crawl') - author_num = input() + author_num = attributes[2].split("=")[1] if not author_num.isdigit(): print("max author num must be an integer") - main_page() + back_to_main_page() author_num = int(author_num) print('Request received, start scraping') if not Parser.check_if_address_valid(url): print("Error: invalid URL") - main_page() - elif book_num >= 2000 or author_num >= 2000: - print("Error, the number of book and author should be lower than 2000") - main_page() + 
back_to_main_page() + elif book_num >= 2000 or author_num >= 500 or book_num <= 0 or author_num <= 0: + print("Error, the number of book and author should be positive and lower than 2000/500") + back_to_main_page() else: if book_num > 200 or author_num > 50: print("Warning, maximum of the number of books or authors is large") - Main.scrape_api(url, book_num, author_num) - main_page() + result = API.post(query, {}) + return result def get(): - print("Please enter your query:") + """ GET API """ + print("==================================================================\n") + print("Please enter your query:\n") query = input() + check_and_back(query) result = API.get(query) typer.echo(result) - main_page() + print() + back_to_main_page() def put(): - print("Please enter your query:") + """ PUT API """ + print("==================================================================\n") + print("Please enter your query:\n") query = input() - print("Please select one way to load update info: input or read:") + check_and_back(query) + print("Please select one way to load update info: input or read:\n") load = input() + check_and_back(load) if load == "input": - print("Please enter json data:") + print("Please enter json data:\n") string = input() + check_and_back(string) try: data = json.loads(string) except ValueError as err: print(err) - main_page() + back_to_main_page() elif load == "read": load_dotenv() file_root = os.getenv('FILE_ROOT') - print("Please enter the name of file which is in the file root directory:") - file_path = r"" + file_root + input() + print("Please enter the name of file which is in the file root directory:\n") + name = input() + check_and_back(name) + file_path = r"" + file_root + name if os.path.isfile(file_path): with open(file_path, "r") as file: try: data = json.load(file) except ValueError as err: print(err) - main_page() + back_to_main_page() else: print("Error: file not exist") - main_page() + back_to_main_page() else: print("Unknown command") - main_page() + back_to_main_page() result = API.put(query, data) typer.echo(result) - main_page() + print() + back_to_main_page() def post(): - print("Please enter your query:") + """ POST API """ + print("==================================================================\n") + print("Please enter your query:\n") query = input() - load_dotenv() - file_root = os.getenv('FILE_ROOT') - print("Please enter the name of file which is in the file root directory:") - file_path = r"" + file_root + input() - if os.path.isfile(file_path): - with open(file_path, "r") as file: - try: - data = json.load(file) - except ValueError as err: - print(err) - main_page() + check_and_back(query) + if re.search("scrape", query): + result = scrape(query) + typer.echo(result) + print() + back_to_main_page() else: - print("Error: file not exist") - main_page() - result = API.post(query, data) - typer.echo(result) - main_page() + load_dotenv() + file_root = os.getenv('FILE_ROOT') + print("Please enter the name of file which is in the file root directory:\n") + name = input() + check_and_back(name) + file_path = r"" + file_root + name + if os.path.isfile(file_path): + with open(file_path, "r") as file: + try: + data = json.load(file) + except ValueError as err: + print(err) + back_to_main_page() + else: + print("Error: file not exist") + back_to_main_page() + result = API.post(query, data) + typer.echo(result) + print() + back_to_main_page() def delete(): - print("Please enter your query:") + """ DELETE API """ + 
print("==================================================================\n") + print("Please enter your query:\n") query = input() + check_and_back(query) result = API.delete(query) typer.echo(result) - main_page() + print() + back_to_main_page() def export(): - print("Which collection do you want to export? books or authors?") + """ export the book or author collection """ + print("==================================================================\n") + print("Which collection do you want to export: books or authors?\n") collection = input() + check_and_back(collection) if collection == "books": - json = API.get("book") + json_file = API.get("book") load_dotenv() file_root = os.getenv('FILE_ROOT') with open(file_root + "books" + ".json", "w") as output: - output.write(json) + output.write(json_file) print("books.json is stored at " + file_root) - main_page() + back_to_main_page() elif collection == "authors": - json = API.get("author") + json_file = API.get("author") load_dotenv() file_root = os.getenv('FILE_ROOT') with open(file_root + "authors" + ".json", "w") as output: - output.write(json) + output.write(json_file) print("authors.json is stored at " + file_root) - main_page() + back_to_main_page() elif collection == "back": - main_page() + back_to_main_page() else: print("Unknown collection") - main_page() - - -# def goodbye(name: str, formal: bool = False): -# """ function2 """ -# if formal: -# typer.echo(f"Goodbye Ms. {name}. Have a good day.") -# else: -# typer.echo(f"Bye {name}!") + back_to_main_page() if __name__ == "__main__": diff --git a/Crawler/Main.py b/Crawler/Main.py deleted file mode 100644 index c778b1c..0000000 --- a/Crawler/Main.py +++ /dev/null @@ -1,81 +0,0 @@ -import Crawler.Spider as Spider -import time -import RegularExpressionParser.Parser as Parser -from DataBase import mongoDB as db - - -def scrape_api(url_api, book_num_api, author_num_api): - print('Request received, start scraping') - book_dict_api = {} - author_dict_api = {} - similar_book_dict_api = {} - count_api = 1 - - print("Crawling book #" + str(count_api)) - start_page_api = Spider.Spider(url_api) - start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) - - while len(book_dict_api) < int(book_num_api) and len(author_dict_api) < int(author_num_api): - all_related_books_api = list(similar_book_dict_api.items()) - for book_url_pair_api in all_related_books_api: - if len(book_dict_api) >= int(book_num_api) or len(author_dict_api) >= int(author_num_api): - break - book_name_api = book_url_pair_api[0] - if book_name_api in book_dict_api: - pass - else: - count_api += 1 - print("Crawling book #" + str(count_api)) - book_url_api = book_url_pair_api[1] - book_spider_api = Spider.Spider(book_url_api) - book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) - - # since the network delay from asia to goodread.com is already very high, - # a small amount of time of sleep is used here. 
- time.sleep(0.2) - - db.insert_dicts(book_dict_api, 0) - db.insert_dicts(author_dict_api, 1) - print("Scraping completed") - - -print('please enter URL of the book') -url = input() -print('please enter the number of books you want to crawl') -book_num = input() -print('please enter the number of authors you want to crawl') -author_num = input() -print('Request received, start scraping') - -book_dict = {} -author_dict = {} -similar_book_dict = {} -count = 1 - -print("Crawling book #" + str(count)) -start_page = Spider.Spider(url) -start_page.crawl(book_dict, author_dict, similar_book_dict) - - -while len(book_dict) < int(book_num) or len(author_dict) < int(author_num): - all_related_books = list(similar_book_dict.items()) - for book_url_pair in all_related_books: - if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num): - break - book_name = book_url_pair[0] - if book_name in book_dict: - pass - else: - count += 1 - print("Crawling book #" + str(count)) - book_url = book_url_pair[1] - book_spider = Spider.Spider(book_url) - book_spider.crawl(book_dict, author_dict, similar_book_dict) - - # since the network delay from asia to goodread.com is already very high, - # a small amount of time of sleep is used here. - time.sleep(0.2) - -db.insert_dicts(book_dict, 0) -db.insert_dicts(author_dict, 1) -print("Scraping completed") diff --git a/Crawler/Scrape.py b/Crawler/Scrape.py new file mode 100644 index 0000000..a2f97ae --- /dev/null +++ b/Crawler/Scrape.py @@ -0,0 +1,38 @@ +import Crawler.Spider as Spider +import time +from DataBase import mongoDB as db + + +def scrape_api(url_api, book_num_api, author_num_api): + print('Request received, start scraping') + book_dict_api = {} + author_dict_api = {} + similar_book_dict_api = {} + count_api = 1 + + print("Crawling book #" + str(count_api)) + start_page_api = Spider.Spider(url_api) + start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + while len(book_dict_api) < int(book_num_api) or len(author_dict_api) < int(author_num_api): + all_related_books_api = list(similar_book_dict_api.items()) + for book_url_pair_api in all_related_books_api: + if len(book_dict_api) >= int(book_num_api) and len(author_dict_api) >= int(author_num_api): + break + book_name_api = book_url_pair_api[0] + if book_name_api in book_dict_api: + pass + else: + count_api += 1 + print("Crawling book #" + str(count_api)) + book_url_api = book_url_pair_api[1] + book_spider_api = Spider.Spider(book_url_api) + book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + # since the network delay from asia to goodread.com is already very high, + # a small amount of time of sleep is used here. 
+ time.sleep(0.2) + + db.insert_dicts(book_dict_api, 0) + db.insert_dicts(author_dict_api, 1) + print("Scraping completed") \ No newline at end of file diff --git a/Crawler/Spider.py b/Crawler/Spider.py index a689789..bab2576 100644 --- a/Crawler/Spider.py +++ b/Crawler/Spider.py @@ -11,12 +11,21 @@ class Spider: self.soup = None def crawl(self, book_dict, author_dict, similar_book_dict): + """ + function used to scrape data of a certain book + :param book_dict: + :param author_dict: + :param similar_book_dict: + :return: none + """ header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"} url = self.url res = requests.get(url, headers=header) self.soup = BeautifulSoup(res.text, "lxml") + + # parsing data of current book titleAndAuthor = self.soup.title.string.split(" by ") title = titleAndAuthor[0] bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0] @@ -36,6 +45,8 @@ class Spider: authorID = (author_page_link.split("show/")[1]).split(".")[0] see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"] res_similar = requests.get(see_more_link, headers=header) + + # parsing similar books of current book similar_soup = BeautifulSoup(res_similar.text, 'lxml') similar_books_and_authors = similar_soup.find_all("span", itemprop="name") similar_books = [] @@ -51,6 +62,8 @@ class Spider: bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count, review_count, imageURL, similar_books) book_dict[title] = bookPKT + + # parse data of the author of current book res_author = requests.get(author_page_link, headers=header) author_soup = BeautifulSoup(res_author.text, 'lxml') author_rating = float(author_soup.find("span", {"class": "average"}).text) diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py index fc6f628..6743910 100644 --- a/RegularExpressionParser/Parser.py +++ b/RegularExpressionParser/Parser.py @@ -58,7 +58,7 @@ def parse_query_to_json(pair): else: # can be A.B: C or A.B: "C" if re.search(":", pair): - if re.search("^[0-9.]*$", elements[1]): + if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]): return {elements[0].split(".")[1]: float(elements[1])} else: if re.search("\".*\"", elements[1]): @@ -67,7 +67,7 @@ def parse_query_to_json(pair): return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}} else: if re.search("NOT", pair): - if re.search("^[0-9.]*$", elements[1]): + if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]): return {elements[0].split(".")[1]: {"$ne": float(elements[1])}} else: return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}} diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py index ae10ffb..8f34109 100644 --- a/Server/SimpleServer.py +++ b/Server/SimpleServer.py @@ -1,6 +1,7 @@ import DataBase.mongoDB as DataBase import RegularExpressionParser.Parser as Parser import re +import Crawler.Scrape as Scrape from flask import Flask from flask import request from urllib.parse import urlparse, parse_qs @@ -20,21 +21,18 @@ def data_base(collection): if collection == "book": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) - # print(qs_parsed["id"]) if qs_parsed == {}: return DataBase.get_documents_json(0, {}) return search_document(["book.id:" + qs_parsed["id"][0]]) elif collection == "author": url_parsed = urlparse(request.url) qs_parsed = 
parse_qs(url_parsed.query) - # print(type(qs_parsed["id"])) if qs_parsed == {}: return DataBase.get_documents_json(1, {}) return search_document(["author.id:" + qs_parsed["id"][0]]) elif collection == "search": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) - print(qs_parsed) return search_document(qs_parsed["q"][0].split("&")) else: return "404: Unknown Collection to GET" @@ -47,7 +45,7 @@ def data_base(collection): elif collection == "author": opt = 1 else: - return "404: Unknown Collection to GET" + return "404: Unknown Collection to PUT" DataBase.update_dicts(opt, request.args.to_dict(), json_update_info) return "200: PUT succeeded" elif request.method == "POST": @@ -63,7 +61,15 @@ def data_base(collection): elif collection == "author": DataBase.insert_document(json_file, 1) elif collection == "scrape": - return "200" + # url_parsed = urlparse(request.url) + # qs_parsed = parse_qs(url_parsed.query) + # print(qs_parsed) + param = request.args.to_dict() + url = param["url"] + max_book = param["max_book"] + max_author = param["max_author"] + Scrape.scrape_api(url, max_book, max_author) + return "201: new data has been added to database" else: return "404: Unknown Collection to POST" return "201: POST succeeded" @@ -81,6 +87,7 @@ def data_base(collection): def search_document(identifiers): + """ function used to find one or several documents in the database """ if len(identifiers) == 1: json_idt = Parser.parse_query_to_json(identifiers[0]) print(json_idt) @@ -96,7 +103,7 @@ def search_document(identifiers): else: opt = 0 else: - if re.search("^book.*",identifiers[2]): + if re.search("^book.*", identifiers[2]): print("Failed to find document: two statements are not pointing to the same collection") return {} else: -- GitLab From 0b8727418b7a440321684b3c8d28ef9957d547d7 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 9 Mar 2021 13:57:01 +0800 Subject: [PATCH 9/9] assignment2.1 final version --- Client/CLI.py | 8 ++++ DataBase/mongoDB.py | 20 +++++----- README.md | 15 +++++++- RegularExpressionParser/Parser.py | 48 +++++++++++------------ Server/SimpleServer.py | 37 ++++++++++-------- Tests/DataBaseTests.py | 53 +++++++++++++++++++++++++ Tests/ParserTests.py | 36 +++++++++++++++++ Tests/ServerTests.py | 64 +++++++++++++++++++++++++++++++ Tests/testData.json | 34 ++++++++++++++++ 9 files changed, 264 insertions(+), 51 deletions(-) create mode 100644 Tests/DataBaseTests.py create mode 100644 Tests/ParserTests.py create mode 100644 Tests/ServerTests.py create mode 100644 Tests/testData.json diff --git a/Client/CLI.py b/Client/CLI.py index 3bb87ef..071ffac 100644 --- a/Client/CLI.py +++ b/Client/CLI.py @@ -140,6 +140,14 @@ def put(): else: print("Unknown command") back_to_main_page() + if re.search("books", query): + if len(data) <= 1: + print("For posting multiple books, please input a json file that contains more than one book\n") + main_page() + else: + if len(data) > 1: + print("Cannot post more than one book with 'post book/author', please use 'post books/authors'") + main_page() result = API.put(query, data) typer.echo(result) print() diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py index 4095fa3..ef437fc 100644 --- a/DataBase/mongoDB.py +++ b/DataBase/mongoDB.py @@ -16,9 +16,9 @@ def get_db(): def insert_document(docu, opt): db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return
@@ -34,9 +34,9 @@ def insert_dicts(dictionary, opt): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return @@ -58,9 +58,9 @@ def update_dicts(opt, identifier, content): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return @@ -82,9 +82,9 @@ def get_documents_json(opt, identifier): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return json.dumps({}) @@ -127,9 +127,9 @@ def clean(opt, identifier): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return diff --git a/README.md b/README.md index 4d0dd40..52026e1 100644 --- a/README.md +++ b/README.md @@ -1 +1,14 @@ -## SP21-CS242 Assignment2 \ No newline at end of file +## SP21-CS242 Assignment2: GoodReads Scraper +This project is for the SP21 CS242 Assignment 2. + +Current version is 2.1. + +### Functions + 1. Scrape data of books and authors from GoodReads + 2. Store scraped data in the cloud database or in local files + 3. Query the cloud database for specific book or author documents + +### Composition +#### Client: command line interface (interactive) +#### Server: simple server based on Flask +#### Database: MongoDB \ No newline at end of file diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py index 6743910..e361d2f 100644 --- a/RegularExpressionParser/Parser.py +++ b/RegularExpressionParser/Parser.py @@ -15,27 +15,27 @@ def check_if_address_valid(address): def parse_query_to_url(query): - elements = re.findall("[0-9A-Za-z_\"><]+", query) + elements = re.findall("[0-9A-Za-z_\"><.]+", query) count = len(elements) - if count == 3: + if count == 2: # can only be A.B:C or wrong - if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2]) + if re.search("^[0-9a-zA-Z_.]+:[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + elements[1]) else: - print("Invalid query.") + print("Invalid query1.") return "" - elif count == 4: + elif count == 3: # a pair and one of [NOT, >, <]. - if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\sNOT\s[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "." + elements[1] + "%3A" + "NOT" + elements[2]) - elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s>\s[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "." + elements[1] + "%3A" + ">" + elements[2]) - elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s<\s[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "."
+ elements[1] + "%3A" + "<" + elements[2]) + if re.search("^[0-9a-zA-Z_.]+:\s*NOT\s*[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + "NOT" + elements[2]) + elif re.search("^[0-9a-zA-Z_.]+:\s*>\s*[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + ">" + elements[2]) + elif re.search("^[0-9a-zA-Z_.]+:\s*<\s*[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + "<" + elements[2]) else: - print("Invalid query.") + print("Invalid query2.") return "" - elif 6 <= count <= 8: + elif 5 <= count <= 7: # AND or OR operator if re.search(".*\sAND\s.*", query): parts = query.split(" AND ") @@ -44,17 +44,22 @@ def parse_query_to_url(query): parts = query.split(" OR ") return parse_query_to_url(parts[0]) + "%26OR%26" + parse_query_to_url(parts[1]) else: - print("Invalid query.") + print("Invalid query3.") return "" def parse_query_to_json(pair): - print(pair) elements = re.findall("[0-9A-Za-z\"_.]+", pair) count = len(elements) - if count != 2: + if count != 2 and count != 3: print("Failed to parse query: invalid args number") return {"wrong": "True"} # will never be found in the database + elif count == 3: + # A.B: NOT C + if re.search("^[0-9.]*$", elements[2]) and not re.search("id", elements[0]): + return {elements[0].split(".")[1]: {"$ne": float(elements[2])}} + else: + return {elements[0].split(".")[1]: {"$not": {"$regex": elements[2], "$options": "i"}}} else: # can be A.B: C or A.B: "C" if re.search(":", pair): @@ -66,12 +71,7 @@ def parse_query_to_json(pair): else: return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}} else: - if re.search("NOT", pair): - if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]): - return {elements[0].split(".")[1]: {"$ne": float(elements[1])}} - else: - return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}} - elif re.search(">", pair): + if re.search(">", pair): return {elements[0].split(".")[1]: {"$gt": float(elements[1])}} elif re.search("<", pair): return {elements[0].split(".")[1]: {"$lt": float(elements[1])}} @@ -81,4 +81,4 @@ def parse_query_to_json(pair): def url_safe(element): - return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C") \ No newline at end of file + return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C").replace("&", "%26").replace(":", "%3A") diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py index 8f34109..6507c32 100644 --- a/Server/SimpleServer.py +++ b/Server/SimpleServer.py @@ -4,6 +4,8 @@ import re import Crawler.Scrape as Scrape from flask import Flask from flask import request +from flask import abort +from flask import jsonify from urllib.parse import urlparse, parse_qs app = Flask(__name__) @@ -12,45 +14,51 @@ app = Flask(__name__) @app.route("/", methods=['GET']) def home(): + """ homepage of the server """ return "200: successfully connected to home page\n" @app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"]) def data_base(collection): + """ database endpoint of the server """ + print("\n===============================\n") + print(collection) + print("\n===============================\n") if request.method == "GET": if collection == "book": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) if qs_parsed == {}: - return DataBase.get_documents_json(0, {}) - return search_document(["book.id:" + qs_parsed["id"][0]]) + return jsonify(DataBase.get_documents_json(0, {})) + return jsonify(search_document(["book.id:" + qs_parsed["id"][0]])) elif
collection == "author": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) if qs_parsed == {}: - return DataBase.get_documents_json(1, {}) - return search_document(["author.id:" + qs_parsed["id"][0]]) + return jsonify(DataBase.get_documents_json(1, {})) + return jsonify(search_document(["author.id:" + qs_parsed["id"][0]])) elif collection == "search": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) - return search_document(qs_parsed["q"][0].split("&")) + result = jsonify(search_document(qs_parsed["q"][0].split("&"))) + return jsonify(result) else: - return "404: Unknown Collection to GET" + abort(404) elif request.method == "PUT": if request.headers["Content-Type"] != "application/json": - return "415: content should be JSON file" + abort(415) json_update_info = request.json if collection == "book": opt = 0 elif collection == "author": opt = 1 else: - return "404: Unknown Collection to PUT" + abort(404) DataBase.update_dicts(opt, request.args.to_dict(), json_update_info) return "200: PUT succeeded" elif request.method == "POST": if request.headers["Content-Type"] != "application/json": - return "415: content should be JSON file" + abort(415, "content should be JSON file") json_file = request.json if collection == "books": DataBase.insert_dicts(json_file, 0) @@ -61,18 +69,15 @@ def data_base(collection): elif collection == "author": DataBase.insert_document(json_file, 1) elif collection == "scrape": - # url_parsed = urlparse(request.url) - # qs_parsed = parse_qs(url_parsed.query) - # print(qs_parsed) param = request.args.to_dict() url = param["url"] max_book = param["max_book"] max_author = param["max_author"] Scrape.scrape_api(url, max_book, max_author) - return "201: new data has been added to database" + return "200: new data has been added to database" else: - return "404: Unknown Collection to POST" - return "201: POST succeeded" + abort(404) + return "200: POST succeeded" elif request.method == "DELETE": identifier = request.args.to_dict() print(identifier) @@ -81,7 +86,7 @@ def data_base(collection): elif collection == "author": opt = 1 else: - return "404: Unknown Collection to DELETE" + abort(404, "Unknown Collection to DELETE") DataBase.clean(opt, identifier) return "200: DELETE succeeded" diff --git a/Tests/DataBaseTests.py b/Tests/DataBaseTests.py new file mode 100644 index 0000000..0173f8d --- /dev/null +++ b/Tests/DataBaseTests.py @@ -0,0 +1,53 @@ +import unittest +import os +import DataBase.mongoDB as db +import json +from pymongo import MongoClient +from dotenv import load_dotenv + + +class DataBaseTests(unittest.TestCase): + + def setUp(self): + file_path = os.path.dirname(__file__) + r"\testData.json" + with open(file_path) as file: + self.test_data = json.load(file) + self.test_data1 = self.test_data["test_data1"] + + def tearDown(self): + data_base = db.get_db() + record = data_base.test_books + record.delete_many({}) + + def test_upload(self): + data_base = db.get_db() + record = data_base.test_books + db.insert_document(self.test_data1, 0) + self.assertEqual(record.count_documents({}), 1) + + def test_download(self): + db.insert_document(self.test_data1, 0) + json_file = json.loads(db.get_documents_json(0, {"title": "Becoming"})) + no_id_test_data1 = self.test_data1 + no_id_test_data1.pop("_id") + self.assertEqual(json_file["books"][0], no_id_test_data1) + + def test_update(self): + data_base = db.get_db() + record = data_base.test_books + db.insert_document(self.test_data1, 0) + db.update_dicts(0, {"title": "Becoming"}, 
{"rating_count": 1000000}) + self.assertEqual(record.count_documents({"rating_count": self.test_data1["rating_count"]}), 0) + self.assertEqual(record.count_documents({"rating_count": 1000000}), 1) + + def test_delete(self): + data_base = db.get_db() + record = data_base.test_books + db.insert_document(self.test_data1, 0) + db.clean(0, {}) + self.assertEqual(record.count_documents({}), 0) + + +if __name__ == '__main__': + unittest.main() + # db.clean(0, {}) diff --git a/Tests/ParserTests.py b/Tests/ParserTests.py new file mode 100644 index 0000000..a46915d --- /dev/null +++ b/Tests/ParserTests.py @@ -0,0 +1,36 @@ +import RegularExpressionParser.Parser as Parser +import unittest + + +class DataBaseTests(unittest.TestCase): + + def test_safe_url(self): + str_query = "search?q=book.rating_count<1000000&AND&book.rating>4.4" + expected_url = "http://127.0.0.1:5000/api/search?q=book.rating_count%3C1000000%26AND%26book.rating%3E4.4" + output_url = "http://127.0.0.1:5000/api/" + Parser.url_safe(str_query) + self.assertEqual(output_url, expected_url) + + def test_valid_goodreads_url(self): + url = "https://www.goodreads.com/book/show/35133922-educated" + self.assertTrue(Parser.check_if_address_valid(url)) + + def test_invalid_goodreads_url(self): + url1 = "https://www.google.com/book/show/35133922-educated" + self.assertFalse(Parser.check_if_address_valid(url1)) + url2 = "https://www.goodreads.com/book/show/over-35133922-educated" + self.assertFalse(Parser.check_if_address_valid(url2)) + url3 = "https://www.goodreads.com/book/show/educated" + self.assertFalse(Parser.check_if_address_valid(url3)) + + def test_parse_query_to_url(self): + query = "book.id:12345678 AND book.rating: > 4.80" + result_url = Parser.parse_query_to_url(query) + expect_url = "book.id%3A12345678%26AND%26book.rating%3A%3E4.80" + self.assertEqual(result_url, expect_url) + + def test_parse_query_to_json(self): + query = "book.id: NOT 12345678" + expect_json = {"id": {"$not": {"$regex": "12345678", "$options": "i"}}} + output_json = Parser.parse_query_to_json(query) + self.assertEqual(output_json, expect_json) + diff --git a/Tests/ServerTests.py b/Tests/ServerTests.py new file mode 100644 index 0000000..2bad5db --- /dev/null +++ b/Tests/ServerTests.py @@ -0,0 +1,64 @@ +import unittest +import os +import json +import DataBase.mongoDB as db +import requests +import Server.SimpleServer + + +class DataBaseTests(unittest.TestCase): + def setUp(self): + file_path = os.path.dirname(__file__) + r"\testData.json" + with open(file_path) as file: + self.test_data = json.load(file) + self.test_data1 = self.test_data["test_data1"] + + def tearDown(self): + data_base = db.get_db() + record = data_base.test_books + record.delete_many({}) + + def test_valid_get(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/book?id=38746485" + res = requests.get(url) + self.assertEqual(res.status_code, 200) + + def test_search(self): + db.insert_document(self.test_data1, 0) + url1 = "http://127.0.0.1:5000/api/search?q=book.id%3A38746485" + res1 = requests.get(url1) + self.assertEqual(res1.status_code, 200) + url2 = "http://127.0.0.1:5000/api/search?q=book.id%3A38746485%26book.rating%3E4.90" + res2 = requests.get(url2) + self.assertEqual(res2.json(), {}) + + def test_invalid_get_not_exist(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/book?id=12345678" + res = requests.get(url) + self.assertEqual(res.status_code, 200) + self.assertEqual(res.json(), {'books': []}) + + def 
test_invalid_get_wrong_collection_name(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/bookssss?id=38746485" + res = requests.get(url) + self.assertEqual(res.status_code, 200) + + def test_valid_put(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/book?id=38746485" + update_info = {"rating_count": 1000000} + res = requests.put(url, json=update_info) + self.assertEqual(res.status_code, 200) + + def test_insert_put(self): + url = "http://127.0.0.1:5000/api/book?id=38746485" + update_info = {"rating_count": 1000000} + res = requests.put(url, json=update_info) + self.assertEqual(res.status_code, 200) + + +if __name__ == '__main__': + unittest.main() diff --git a/Tests/testData.json b/Tests/testData.json new file mode 100644 index 0000000..5e7dc0b --- /dev/null +++ b/Tests/testData.json @@ -0,0 +1,34 @@ +{ + "test_data1": {"type": "book", + "book_url": "https://www.goodreads.com/book/show/38746485-becoming", + "title": "Becoming", + "id": "38746485", + "ISBN": "0", + "author_url": "https://www.goodreads.com/author/show/2338628.Michelle_Obama", + "author": "Michelle Obama", + "rating": 4.52, + "rating_count": 661305, + "review_count": 54516, + "image_url": "https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1528206996l/38746485.jpg", + "similar_books": ["Becoming", + "Educated", + "Born a Crime: Stories From a South African Childhood", + "More Than Love, A Husband's Tale", + "I Am Malala: The Story of the Girl Who Stood Up for Education and Was Shot by the Taliban", + "Untamed", + "Where the Crawdads Sing", + "A Promised Land", + "Dreams from My Father: A Story of Race and Inheritance", + "Drum Beats, Heart Beats (Dancing Soul Trilogy, #3)", + "Eat, Pray, Love", + "Losing Jon: A Teen's Tragic Death, a Police Cover-Up, a Community's Fight for Justice", + "I Know Why the Caged Bird Sings (Maya Angelou's Autobiography, #1)", + "The Vanishing Half", + "Self Made: Inspired by the Life of Madam C.J. Walker", + "Bossypants", + "The Audacity of Hope: Thoughts on Reclaiming the American Dream", + "Such a Fun Age", + "Between the World and Me", + "Black Fortunes: The Story of the First Six African Americans Who Escaped Slavery and Became Millionaires", + "Little Fires Everywhere"]} +} \ No newline at end of file -- GitLab
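Usage note (not part of the patch series): a minimal sketch of how a client might exercise the API introduced above, assuming the Flask server from Server/SimpleServer.py is running locally at http://127.0.0.1:5000 as in Tests/ServerTests.py. The book id, query string, and update payload are example values taken from the test data; the max_book/max_author limits are arbitrary.

import requests

BASE = "http://127.0.0.1:5000/api"

# Trigger a scrape run. SimpleServer reads url, max_book and max_author from the
# query string, but its POST branch first checks for a JSON content type, so an
# empty JSON body is sent along to satisfy that check.
res = requests.post(BASE + "/scrape",
                    params={"url": "https://www.goodreads.com/book/show/35133922-educated",
                            "max_book": 5, "max_author": 5},
                    json={})
print(res.status_code, res.text)

# Search stored books. ':', '>', '<' and the joining '&' are percent-encoded
# (%3A, %3E, %3C, %26), the same escaping Parser.url_safe applies on the client side.
res = requests.get(BASE + "/search?q=book.rating_count%3C1000000%26AND%26book.rating%3E4.4")
print(res.json())

# Update a single book document by id; requests sets the JSON content type itself.
res = requests.put(BASE + "/book", params={"id": "38746485"}, json={"rating_count": 1000000})
print(res.text)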