Skip to content
Snippets Groups Projects
Commit 0b872741 authored by zshan2's avatar zshan2
Browse files

assignment2.1 final version

parent 155ae6b1
No related branches found
No related tags found
1 merge request!1<assignment-2.1>
......@@ -140,6 +140,14 @@ def put():
else:
print("Unknown command")
back_to_main_page()
if re.search("books", query):
if len(data) <= 1:
print("For posting multiple books, please input json file has more than 1 books\n")
main_page()
else:
if len(data) > 1:
print("Cannot post more than one book with 'post book/author', please use 'post books/authors'")
main_page()
result = API.put(query, data)
typer.echo(result)
print()
......
......@@ -16,9 +16,9 @@ def get_db():
def insert_document(docu, opt):
db = get_db()
if opt == 0:
records = db.books
records = db.test_books
elif opt == 1:
records = db.authors
records = db.test_authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
......@@ -34,9 +34,9 @@ def insert_dicts(dictionary, opt):
"""
db = get_db()
if opt == 0:
records = db.books
records = db.test_books
elif opt == 1:
records = db.authors
records = db.test_authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
......@@ -58,9 +58,9 @@ def update_dicts(opt, identifier, content):
"""
db = get_db()
if opt == 0:
records = db.books
records = db.test_books
elif opt == 1:
records = db.authors
records = db.test_authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
......@@ -82,9 +82,9 @@ def get_documents_json(opt, identifier):
"""
db = get_db()
if opt == 0:
records = db.books
records = db.test_books
elif opt == 1:
records = db.authors
records = db.test_authors
else:
print("failed to get json file: wrong opt for selecting collection")
return json.dumps({})
......@@ -127,9 +127,9 @@ def clean(opt, identifier):
"""
db = get_db()
if opt == 0:
records = db.books
records = db.test_books
elif opt == 1:
records = db.authors
records = db.test_authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
......
## SP21-CS242 Assignment2
\ No newline at end of file
## SP21-CS242 Assignment2: GoodReads Scraper
This project is for sp21 CS242 assignment2
Current version is 2.1
### Functions
1. Scrape data for books and authors from GoodReads
2. Store scraped data in the cloud or in local files
3. Query the cloud for certain documents of books or authors
### Composition
#### Client: Command line interface (interactive)
#### Server: Simple server based on Flask
#### Database: MongoDB
\ No newline at end of file
......@@ -15,27 +15,27 @@ def check_if_address_valid(address):
def parse_query_to_url(query):
elements = re.findall("[0-9A-Za-z_\"><]+", query)
elements = re.findall("[0-9A-Za-z_\"><.]+", query)
count = len(elements)
if count == 3:
if count == 2:
# can only be A.B:C or wrong
if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2])
if re.search("^[0-9a-zA-Z_.]+:[0-9a-zA-Z_\".]", query):
return url_safe(elements[0] + "%3A" + elements[1])
else:
print("Invalid query.")
print("Invalid query1.")
return ""
elif count == 4:
elif count == 3:
# a pair and one of [NOT, >, <].
if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\sNOT\s[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + "NOT" + elements[2])
elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s>\s[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + ">" + elements[2])
elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s<\s[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + "<" + elements[2])
if re.search("^[0-9a-zA-Z_.]+:\s*NOT\s*[0-9a-zA-Z_\".]", query):
return url_safe(elements[0] + "%3A" + "NOT" + elements[2])
elif re.search("^[0-9a-zA-Z_.]+:\s*>\s*[0-9a-zA-Z_\".]", query):
return url_safe(elements[0] + "%3A" + ">" + elements[2])
elif re.search("^[0-9a-zA-Z_.]+:\s*<\s*[0-9a-zA-Z_\".]", query):
return url_safe(elements[0] + "%3A" + "<" + elements[2])
else:
print("Invalid query.")
print("Invalid query2.")
return ""
elif 6 <= count <= 8:
elif 5 <= count <= 7:
# AND or OR operator
if re.search(".*\sAND\s.*", query):
parts = query.split(" AND ")
......@@ -44,17 +44,22 @@ def parse_query_to_url(query):
parts = query.split(" OR ")
return parse_query_to_url(parts[0]) + "%26OR%26" + parse_query_to_url(parts[1])
else:
print("Invalid query.")
print("Invalid query3.")
return ""
def parse_query_to_json(pair):
print(pair)
elements = re.findall("[0-9A-Za-z\"_.]+", pair)
count = len(elements)
if count != 2:
if count != 2 and count != 3:
print("Failed to parse query: invalid args number")
return {"wrong": "True"} # will never be founded in database
elif count == 3:
# A.B: NOT C
if re.search("^[0-9.]*$", elements[2]) and not re.search("id", elements[0]):
return {elements[0].split(".")[1]: {"$ne": float(elements[2])}}
else:
return {elements[0].split(".")[1]: {"$not": {"$regex": elements[2], "$options": "i"}}}
else:
# can be A.B: C or A.B: "C"
if re.search(":", pair):
......@@ -66,12 +71,7 @@ def parse_query_to_json(pair):
else:
return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}}
else:
if re.search("NOT", pair):
if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]):
return {elements[0].split(".")[1]: {"$ne": float(elements[1])}}
else:
return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}}
elif re.search(">", pair):
if re.search(">", pair):
return {elements[0].split(".")[1]: {"$gt": float(elements[1])}}
elif re.search("<", pair):
return {elements[0].split(".")[1]: {"$lt": float(elements[1])}}
......@@ -81,4 +81,4 @@ def parse_query_to_json(pair):
def url_safe(element):
return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C")
\ No newline at end of file
return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C").replace("&", "%26").replace(":", "%3A")
......@@ -4,6 +4,8 @@ import re
import Crawler.Scrape as Scrape
from flask import Flask
from flask import request
from flask import abort
from flask import jsonify
from urllib.parse import urlparse, parse_qs
app = Flask(__name__)
......@@ -12,45 +14,51 @@ app = Flask(__name__)
@app.route("/", methods=['GET'])
def home():
""" homepage of server """
return "200: successfully connected to home page\n"
@app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"])
def data_base(collection):
""" data base page of server """
print("\n===============================\n")
print(collection)
print("\n===============================\n")
if request.method == "GET":
if collection == "book":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
if qs_parsed == {}:
return DataBase.get_documents_json(0, {})
return search_document(["book.id:" + qs_parsed["id"][0]])
return jsonify(DataBase.get_documents_json(0, {}))
return jsonify(search_document(["book.id:" + qs_parsed["id"][0]]))
elif collection == "author":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
if qs_parsed == {}:
return DataBase.get_documents_json(1, {})
return search_document(["author.id:" + qs_parsed["id"][0]])
return jsonify(DataBase.get_documents_json(1, {}))
return jsonify(search_document(["author.id:" + qs_parsed["id"][0]]))
elif collection == "search":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
return search_document(qs_parsed["q"][0].split("&"))
result = jsonify(search_document(qs_parsed["q"][0].split("&")))
return jsonify(result)
else:
return "404: Unknown Collection to GET"
abort(404)
elif request.method == "PUT":
if request.headers["Content-Type"] != "application/json":
return "415: content should be JSON file"
abort(415)
json_update_info = request.json
if collection == "book":
opt = 0
elif collection == "author":
opt = 1
else:
return "404: Unknown Collection to PUT"
abort(404)
DataBase.update_dicts(opt, request.args.to_dict(), json_update_info)
return "200: PUT succeeded"
elif request.method == "POST":
if request.headers["Content-Type"] != "application/json":
return "415: content should be JSON file"
abort(415, "content should be JSON file")
json_file = request.json
if collection == "books":
DataBase.insert_dicts(json_file, 0)
......@@ -61,18 +69,15 @@ def data_base(collection):
elif collection == "author":
DataBase.insert_document(json_file, 1)
elif collection == "scrape":
# url_parsed = urlparse(request.url)
# qs_parsed = parse_qs(url_parsed.query)
# print(qs_parsed)
param = request.args.to_dict()
url = param["url"]
max_book = param["max_book"]
max_author = param["max_author"]
Scrape.scrape_api(url, max_book, max_author)
return "201: new data has been added to database"
return "200: new data has been added to database"
else:
return "404: Unknown Collection to POST"
return "201: POST succeeded"
abort(404)
return "200: POST succeeded"
elif request.method == "DELETE":
identifier = request.args.to_dict()
print(identifier)
......@@ -81,7 +86,7 @@ def data_base(collection):
elif collection == "author":
opt = 1
else:
return "404: Unknown Collection to DELETE"
abort(404, "Unknown Collection to DELETE")
DataBase.clean(opt, identifier)
return "200: DELETE succeeded"
......
import unittest
import os
import DataBase.mongoDB as db
import json
from pymongo import MongoClient
from dotenv import load_dotenv
class DataBaseTests(unittest.TestCase):
    """Integration tests for the DataBase.mongoDB CRUD helpers.

    Every test works against the ``test_books`` collection; ``tearDown``
    wipes it after each test so the cases stay independent.
    """

    def setUp(self):
        """Load the shared fixture document from testData.json."""
        # os.path.join keeps the path portable; the original concatenation
        # with a literal backslash (r"\testData.json") only worked on Windows.
        file_path = os.path.join(os.path.dirname(__file__), "testData.json")
        with open(file_path) as file:
            self.test_data = json.load(file)
        self.test_data1 = self.test_data["test_data1"]

    def tearDown(self):
        """Remove every document inserted during the test."""
        data_base = db.get_db()
        record = data_base.test_books
        record.delete_many({})

    def test_upload(self):
        """insert_document with opt=0 should add exactly one book document."""
        data_base = db.get_db()
        record = data_base.test_books
        db.insert_document(self.test_data1, 0)
        self.assertEqual(record.count_documents({}), 1)

    def test_download(self):
        """get_documents_json should return the stored book (minus _id)."""
        db.insert_document(self.test_data1, 0)
        json_file = json.loads(db.get_documents_json(0, {"title": "Becoming"}))
        # pymongo's insert mutates the dict in place and adds "_id";
        # drop it before comparing with the server's JSON payload.
        no_id_test_data1 = self.test_data1
        no_id_test_data1.pop("_id")
        self.assertEqual(json_file["books"][0], no_id_test_data1)

    def test_update(self):
        """update_dicts should overwrite the matched field's value."""
        data_base = db.get_db()
        record = data_base.test_books
        db.insert_document(self.test_data1, 0)
        db.update_dicts(0, {"title": "Becoming"}, {"rating_count": 1000000})
        self.assertEqual(record.count_documents({"rating_count": self.test_data1["rating_count"]}), 0)
        self.assertEqual(record.count_documents({"rating_count": 1000000}), 1)

    def test_delete(self):
        """clean with an empty filter should drop every document."""
        data_base = db.get_db()
        record = data_base.test_books
        db.insert_document(self.test_data1, 0)
        db.clean(0, {})
        self.assertEqual(record.count_documents({}), 0)
# Allow running this test module directly: python <this file>.
# (Removed a stale commented-out db.clean(0, {}) left over from debugging.)
if __name__ == '__main__':
    unittest.main()
import RegularExpressionParser.Parser as Parser
import unittest
class DataBaseTests(unittest.TestCase):
    """Unit tests for the RegularExpressionParser.Parser helpers."""

    def test_safe_url(self):
        """url_safe should percent-encode <, > and & in a query string."""
        raw_query = "search?q=book.rating_count<1000000&AND&book.rating>4.4"
        encoded = "http://127.0.0.1:5000/api/" + Parser.url_safe(raw_query)
        self.assertEqual(
            encoded,
            "http://127.0.0.1:5000/api/search?q=book.rating_count%3C1000000%26AND%26book.rating%3E4.4",
        )

    def test_valid_goodreads_url(self):
        """A canonical GoodReads book URL is accepted."""
        address = "https://www.goodreads.com/book/show/35133922-educated"
        self.assertTrue(Parser.check_if_address_valid(address))

    def test_invalid_goodreads_url(self):
        """Wrong host, malformed id segment, or missing id are all rejected."""
        bad_addresses = [
            "https://www.google.com/book/show/35133922-educated",
            "https://www.goodreads.com/book/show/over-35133922-educated",
            "https://www.goodreads.com/book/show/educated",
        ]
        for address in bad_addresses:
            self.assertFalse(Parser.check_if_address_valid(address))

    def test_parse_query_to_url(self):
        """AND queries are joined with %26AND%26 between the two clauses."""
        result_url = Parser.parse_query_to_url("book.id:12345678 AND book.rating: > 4.80")
        self.assertEqual(result_url, "book.id%3A12345678%26AND%26book.rating%3A%3E4.80")

    def test_parse_query_to_json(self):
        """NOT on the id field becomes a negated case-insensitive regex."""
        output_json = Parser.parse_query_to_json("book.id: NOT 12345678")
        self.assertEqual(
            output_json,
            {"id": {"$not": {"$regex": "12345678", "$options": "i"}}},
        )
import unittest
import os
import json
import DataBase.mongoDB as db
import requests
import Server.SimpleServer
class DataBaseTests(unittest.TestCase):
    """End-to-end tests for the Flask server's /api endpoints.

    NOTE(review): these are integration tests — they assume the server from
    Server.SimpleServer is already listening on 127.0.0.1:5000 and that
    MongoDB is reachable.
    """

    def setUp(self):
        """Load the shared fixture document from testData.json."""
        # os.path.join keeps the path portable; the original concatenation
        # with a literal backslash (r"\testData.json") only worked on Windows.
        file_path = os.path.join(os.path.dirname(__file__), "testData.json")
        with open(file_path) as file:
            self.test_data = json.load(file)
        self.test_data1 = self.test_data["test_data1"]

    def tearDown(self):
        """Wipe the test_books collection so tests stay independent."""
        data_base = db.get_db()
        record = data_base.test_books
        record.delete_many({})

    def test_valid_get(self):
        """GET /api/book?id=<existing id> should return 200."""
        db.insert_document(self.test_data1, 0)
        url = "http://127.0.0.1:5000/api/book?id=38746485"
        res = requests.get(url)
        self.assertEqual(res.status_code, 200)

    def test_search(self):
        """GET /api/search handles matching and empty-result queries."""
        db.insert_document(self.test_data1, 0)
        url1 = "http://127.0.0.1:5000/api/search?q=book.id%3A38746485"
        res1 = requests.get(url1)
        self.assertEqual(res1.status_code, 200)
        # No stored book has rating > 4.90, so the result set is empty.
        url2 = "http://127.0.0.1:5000/api/search?q=book.id%3A38746485%26book.rating%3E4.90"
        res2 = requests.get(url2)
        self.assertEqual(res2.json(), {})

    def test_invalid_get_not_exist(self):
        """GET with an unknown id returns 200 and an empty book list."""
        db.insert_document(self.test_data1, 0)
        url = "http://127.0.0.1:5000/api/book?id=12345678"
        res = requests.get(url)
        self.assertEqual(res.status_code, 200)
        self.assertEqual(res.json(), {'books': []})

    def test_invalid_get_wrong_collection_name(self):
        """GET on an unknown collection is rejected with 404.

        The server aborts with 404 for collections other than book/author/
        search, so the original expectation of 200 could never pass.
        """
        db.insert_document(self.test_data1, 0)
        url = "http://127.0.0.1:5000/api/bookssss?id=38746485"
        res = requests.get(url)
        self.assertEqual(res.status_code, 404)

    def test_valid_put(self):
        """PUT with a JSON body on an existing book should return 200."""
        db.insert_document(self.test_data1, 0)
        url = "http://127.0.0.1:5000/api/book?id=38746485"
        update_info = {"rating_count": 1000000}
        res = requests.put(url, json=update_info)
        self.assertEqual(res.status_code, 200)

    def test_insert_put(self):
        """PUT on an id that is absent from the database still returns 200."""
        url = "http://127.0.0.1:5000/api/book?id=38746485"
        update_info = {"rating_count": 1000000}
        res = requests.put(url, json=update_info)
        self.assertEqual(res.status_code, 200)
# Allow running this test module directly: python <this file>.
if __name__ == '__main__':
    unittest.main()
{
"test_data1": {"type": "book",
"book_url": "https://www.goodreads.com/book/show/38746485-becoming",
"title": "Becoming",
"id": "38746485",
"ISBN": "0",
"author_url": "https://www.goodreads.com/author/show/2338628.Michelle_Obama",
"author": "Michelle Obama",
"rating": 4.52,
"rating_count": 661305,
"review_count": 54516,
"image_url": "https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1528206996l/38746485.jpg",
"similar_books": ["Becoming",
"Educated",
"Born a Crime: Stories From a South African Childhood",
"More Than Love, A Husband's Tale",
"I Am Malala: The Story of the Girl Who Stood Up for Education and Was Shot by the Taliban",
"Untamed",
"Where the Crawdads Sing",
"A Promised Land",
"Dreams from My Father: A Story of Race and Inheritance",
"Drum Beats, Heart Beats (Dancing Soul Trilogy, #3)",
"Eat, Pray, Love",
"Losing Jon: A Teen's Tragic Death, a Police Cover-Up, a Community's Fight for Justice",
"I Know Why the Caged Bird Sings (Maya Angelou's Autobiography, #1)",
"The Vanishing Half",
"Self Made: Inspired by the Life of Madam C.J. Walker",
"Bossypants",
"The Audacity of Hope: Thoughts on Reclaiming the American Dream",
"Such a Fun Age",
"Between the World and Me",
"Black Fortunes: The Story of the First Six African Americans Who Escaped Slavery and Became Millionaires",
"Little Fires Everywhere"]}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment