From 0b8727418b7a440321684b3c8d28ef9957d547d7 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 9 Mar 2021 13:57:01 +0800 Subject: [PATCH] assignment2.1 final version --- Client/CLI.py | 8 ++++ DataBase/mongoDB.py | 20 +++++----- README.md | 15 +++++++- RegularExpressionParser/Parser.py | 48 +++++++++++------------ Server/SimpleServer.py | 37 ++++++++++-------- Tests/DataBaseTests.py | 53 +++++++++++++++++++++++++ Tests/ParserTests.py | 36 +++++++++++++++++ Tests/ServerTests.py | 64 +++++++++++++++++++++++++++++++ Tests/testData.json | 34 ++++++++++++++++ 9 files changed, 264 insertions(+), 51 deletions(-) create mode 100644 Tests/DataBaseTests.py create mode 100644 Tests/ParserTests.py create mode 100644 Tests/ServerTests.py create mode 100644 Tests/testData.json diff --git a/Client/CLI.py b/Client/CLI.py index 3bb87ef..071ffac 100644 --- a/Client/CLI.py +++ b/Client/CLI.py @@ -140,6 +140,14 @@ def put(): else: print("Unknown command") back_to_main_page() + if re.search("books", query): + if len(data) <= 1: + print("For posting multiple books, please input json file has more than 1 books\n") + main_page() + else: + if len(data) > 1: + print("Cannot post more than one book with 'post book/author', please use 'post books/authors'") + main_page() result = API.put(query, data) typer.echo(result) print() diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py index 4095fa3..ef437fc 100644 --- a/DataBase/mongoDB.py +++ b/DataBase/mongoDB.py @@ -16,9 +16,9 @@ def get_db(): def insert_document(docu, opt): db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return @@ -34,9 +34,9 @@ def insert_dicts(dictionary, opt): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return @@ -58,9 +58,9 @@ def update_dicts(opt, identifier, content): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return @@ -82,9 +82,9 @@ def get_documents_json(opt, identifier): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return json.dumps({}) @@ -127,9 +127,9 @@ def clean(opt, identifier): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return diff --git a/README.md b/README.md index 4d0dd40..52026e1 100644 --- a/README.md +++ b/README.md @@ -1 +1,14 @@ -## SP21-CS242 Assignment2 \ No newline at end of file +## SP21-CS242 Assignment2: GoodReads Scraper +This project is for sp21 CS242 assignment2 + +Current version is 2.1 + +###Function + 1. Scrap data of books and authors from GoodReads + 2. Store scraped data on cloud or local files + 3. Query cloud for certain documents of books or authors + +###Composition: +####Client: Command line interface (interactive) +####Server: Simple server based on Flask +####Database: MongoDB \ No newline at end of file diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py index 6743910..e361d2f 100644 --- a/RegularExpressionParser/Parser.py +++ b/RegularExpressionParser/Parser.py @@ -15,27 +15,27 @@ def check_if_address_valid(address): def parse_query_to_url(query): - elements = re.findall("[0-9A-Za-z_\"><]+", query) + elements = re.findall("[0-9A-Za-z_\"><.]+", query) count = len(elements) - if count == 3: + if count == 2: # can only be A.B:C or wrong - if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2]) + if re.search("^[0-9a-zA-Z_.]+:[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + elements[1]) else: - print("Invalid query.") + print("Invalid query1.") return "" - elif count == 4: + elif count == 3: # a pair and one of [NOT, >, <]. - if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\sNOT\s[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "." + elements[1] + "%3A" + "NOT" + elements[2]) - elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s>\s[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "." + elements[1] + "%3A" + ">" + elements[2]) - elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s<\s[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "." + elements[1] + "%3A" + "<" + elements[2]) + if re.search("^[0-9a-zA-Z_.]+:\s*NOT\s*[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + "NOT" + elements[2]) + elif re.search("^[0-9a-zA-Z_.]+:\s*>\s*[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + ">" + elements[2]) + elif re.search("^[0-9a-zA-Z_.]+:\s*<\s*[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + "<" + elements[2]) else: - print("Invalid query.") + print("Invalid query2.") return "" - elif 6 <= count <= 8: + elif 5 <= count <= 7: # AND or OR operator if re.search(".*\sAND\s.*", query): parts = query.split(" AND ") @@ -44,17 +44,22 @@ def parse_query_to_url(query): parts = query.split(" OR ") return parse_query_to_url(parts[0]) + "%26OR%26" + parse_query_to_url(parts[1]) else: - print("Invalid query.") + print("Invalid query3.") return "" def parse_query_to_json(pair): - print(pair) elements = re.findall("[0-9A-Za-z\"_.]+", pair) count = len(elements) - if count != 2: + if count != 2 and count != 3: print("Failed to parse query: invalid args number") return {"wrong": "True"} # will never be founded in database + elif count == 3: + # A.B: NOT C + if re.search("^[0-9.]*$", elements[2]) and not re.search("id", elements[0]): + return {elements[0].split(".")[1]: {"$ne": float(elements[2])}} + else: + return {elements[0].split(".")[1]: {"$not": {"$regex": elements[2], "$options": "i"}}} else: # can be A.B: C or A.B: "C" if re.search(":", pair): @@ -66,12 +71,7 @@ def parse_query_to_json(pair): else: return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}} else: - if re.search("NOT", pair): - if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]): - return {elements[0].split(".")[1]: {"$ne": float(elements[1])}} - else: - return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}} - elif re.search(">", pair): + if re.search(">", pair): return {elements[0].split(".")[1]: {"$gt": float(elements[1])}} elif re.search("<", pair): return {elements[0].split(".")[1]: {"$lt": float(elements[1])}} @@ -81,4 +81,4 @@ def parse_query_to_json(pair): def url_safe(element): - return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C") \ No newline at end of file + return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C").replace("&", "%26").replace(":", "%3A") diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py index 8f34109..6507c32 100644 --- a/Server/SimpleServer.py +++ b/Server/SimpleServer.py @@ -4,6 +4,8 @@ import re import Crawler.Scrape as Scrape from flask import Flask from flask import request +from flask import abort +from flask import jsonify from urllib.parse import urlparse, parse_qs app = Flask(__name__) @@ -12,45 +14,51 @@ app = Flask(__name__) @app.route("/", methods=['GET']) def home(): + """ homepage of server """ return "200: successfully connected to home page\n" @app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"]) def data_base(collection): + """ data base page of server """ + print("\n===============================\n") + print(collection) + print("\n===============================\n") if request.method == "GET": if collection == "book": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) if qs_parsed == {}: - return DataBase.get_documents_json(0, {}) - return search_document(["book.id:" + qs_parsed["id"][0]]) + return jsonify(DataBase.get_documents_json(0, {})) + return jsonify(search_document(["book.id:" + qs_parsed["id"][0]])) elif collection == "author": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) if qs_parsed == {}: - return DataBase.get_documents_json(1, {}) - return search_document(["author.id:" + qs_parsed["id"][0]]) + return jsonify(DataBase.get_documents_json(1, {})) + return jsonify(search_document(["author.id:" + qs_parsed["id"][0]])) elif collection == "search": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) - return search_document(qs_parsed["q"][0].split("&")) + result = jsonify(search_document(qs_parsed["q"][0].split("&"))) + return jsonify(result) else: - return "404: Unknown Collection to GET" + abort(404) elif request.method == "PUT": if request.headers["Content-Type"] != "application/json": - return "415: content should be JSON file" + abort(415) json_update_info = request.json if collection == "book": opt = 0 elif collection == "author": opt = 1 else: - return "404: Unknown Collection to PUT" + abort(404) DataBase.update_dicts(opt, request.args.to_dict(), json_update_info) return "200: PUT succeeded" elif request.method == "POST": if request.headers["Content-Type"] != "application/json": - return "415: content should be JSON file" + abort(415, "content should be JSON file") json_file = request.json if collection == "books": DataBase.insert_dicts(json_file, 0) @@ -61,18 +69,15 @@ def data_base(collection): elif collection == "author": DataBase.insert_document(json_file, 1) elif collection == "scrape": - # url_parsed = urlparse(request.url) - # qs_parsed = parse_qs(url_parsed.query) - # print(qs_parsed) param = request.args.to_dict() url = param["url"] max_book = param["max_book"] max_author = param["max_author"] Scrape.scrape_api(url, max_book, max_author) - return "201: new data has been added to database" + return "200: new data has been added to database" else: - return "404: Unknown Collection to POST" - return "201: POST succeeded" + abort(404) + return "200: POST succeeded" elif request.method == "DELETE": identifier = request.args.to_dict() print(identifier) @@ -81,7 +86,7 @@ def data_base(collection): elif collection == "author": opt = 1 else: - return "404: Unknown Collection to DELETE" + abort(404, "Unknown Collection to DELETE") DataBase.clean(opt, identifier) return "200: DELETE succeeded" diff --git a/Tests/DataBaseTests.py b/Tests/DataBaseTests.py new file mode 100644 index 0000000..0173f8d --- /dev/null +++ b/Tests/DataBaseTests.py @@ -0,0 +1,53 @@ +import unittest +import os +import DataBase.mongoDB as db +import json +from pymongo import MongoClient +from dotenv import load_dotenv + + +class DataBaseTests(unittest.TestCase): + + def setUp(self): + file_path = os.path.dirname(__file__) + r"\testData.json" + with open(file_path) as file: + self.test_data = json.load(file) + self.test_data1 = self.test_data["test_data1"] + + def tearDown(self): + data_base = db.get_db() + record = data_base.test_books + record.delete_many({}) + + def test_upload(self): + data_base = db.get_db() + record = data_base.test_books + db.insert_document(self.test_data1, 0) + self.assertEqual(record.count_documents({}), 1) + + def test_download(self): + db.insert_document(self.test_data1, 0) + json_file = json.loads(db.get_documents_json(0, {"title": "Becoming"})) + no_id_test_data1 = self.test_data1 + no_id_test_data1.pop("_id") + self.assertEqual(json_file["books"][0], no_id_test_data1) + + def test_update(self): + data_base = db.get_db() + record = data_base.test_books + db.insert_document(self.test_data1, 0) + db.update_dicts(0, {"title": "Becoming"}, {"rating_count": 1000000}) + self.assertEqual(record.count_documents({"rating_count": self.test_data1["rating_count"]}), 0) + self.assertEqual(record.count_documents({"rating_count": 1000000}), 1) + + def test_delete(self): + data_base = db.get_db() + record = data_base.test_books + db.insert_document(self.test_data1, 0) + db.clean(0, {}) + self.assertEqual(record.count_documents({}), 0) + + +if __name__ == '__main__': + unittest.main() + # db.clean(0, {}) diff --git a/Tests/ParserTests.py b/Tests/ParserTests.py new file mode 100644 index 0000000..a46915d --- /dev/null +++ b/Tests/ParserTests.py @@ -0,0 +1,36 @@ +import RegularExpressionParser.Parser as Parser +import unittest + + +class DataBaseTests(unittest.TestCase): + + def test_safe_url(self): + str_query = "search?q=book.rating_count<1000000&AND&book.rating>4.4" + expected_url = "http://127.0.0.1:5000/api/search?q=book.rating_count%3C1000000%26AND%26book.rating%3E4.4" + output_url = "http://127.0.0.1:5000/api/" + Parser.url_safe(str_query) + self.assertEqual(output_url, expected_url) + + def test_valid_goodreads_url(self): + url = "https://www.goodreads.com/book/show/35133922-educated" + self.assertTrue(Parser.check_if_address_valid(url)) + + def test_invalid_goodreads_url(self): + url1 = "https://www.google.com/book/show/35133922-educated" + self.assertFalse(Parser.check_if_address_valid(url1)) + url2 = "https://www.goodreads.com/book/show/over-35133922-educated" + self.assertFalse(Parser.check_if_address_valid(url2)) + url3 = "https://www.goodreads.com/book/show/educated" + self.assertFalse(Parser.check_if_address_valid(url3)) + + def test_parse_query_to_url(self): + query = "book.id:12345678 AND book.rating: > 4.80" + result_url = Parser.parse_query_to_url(query) + expect_url = "book.id%3A12345678%26AND%26book.rating%3A%3E4.80" + self.assertEqual(result_url, expect_url) + + def test_parse_query_to_json(self): + query = "book.id: NOT 12345678" + expect_json = {"id": {"$not": {"$regex": "12345678", "$options": "i"}}} + output_json = Parser.parse_query_to_json(query) + self.assertEqual(output_json, expect_json) + diff --git a/Tests/ServerTests.py b/Tests/ServerTests.py new file mode 100644 index 0000000..2bad5db --- /dev/null +++ b/Tests/ServerTests.py @@ -0,0 +1,64 @@ +import unittest +import os +import json +import DataBase.mongoDB as db +import requests +import Server.SimpleServer + + +class DataBaseTests(unittest.TestCase): + def setUp(self): + file_path = os.path.dirname(__file__) + r"\testData.json" + with open(file_path) as file: + self.test_data = json.load(file) + self.test_data1 = self.test_data["test_data1"] + + def tearDown(self): + data_base = db.get_db() + record = data_base.test_books + record.delete_many({}) + + def test_valid_get(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/book?id=38746485" + res = requests.get(url) + self.assertEqual(res.status_code, 200) + + def test_search(self): + db.insert_document(self.test_data1, 0) + url1 = "http://127.0.0.1:5000/api/search?q=book.id%3A38746485" + res1 = requests.get(url1) + self.assertEqual(res1.status_code, 200) + url2 = "http://127.0.0.1:5000/api/search?q=book.id%3A38746485%26book.rating%3E4.90" + res2 = requests.get(url2) + self.assertEqual(res2.json(), {}) + + def test_invalid_get_not_exist(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/book?id=12345678" + res = requests.get(url) + self.assertEqual(res.status_code, 200) + self.assertEqual(res.json(), {'books': []}) + + def test_invalid_get_wrong_collection_name(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/bookssss?id=38746485" + res = requests.get(url) + self.assertEqual(res.status_code, 200) + + def test_valid_put(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/book?id=38746485" + update_info = {"rating_count": 1000000} + res = requests.put(url, json=update_info) + self.assertEqual(res.status_code, 200) + + def test_insert_put(self): + url = "http://127.0.0.1:5000/api/book?id=38746485" + update_info = {"rating_count": 1000000} + res = requests.put(url, json=update_info) + self.assertEqual(res.status_code, 200) + + +if __name__ == '__main__': + unittest.main() diff --git a/Tests/testData.json b/Tests/testData.json new file mode 100644 index 0000000..5e7dc0b --- /dev/null +++ b/Tests/testData.json @@ -0,0 +1,34 @@ +{ + "test_data1": {"type": "book", + "book_url": "https://www.goodreads.com/book/show/38746485-becoming", + "title": "Becoming", + "id": "38746485", + "ISBN": "0", + "author_url": "https://www.goodreads.com/author/show/2338628.Michelle_Obama", + "author": "Michelle Obama", + "rating": 4.52, + "rating_count": 661305, + "review_count": 54516, + "image_url": "https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1528206996l/38746485.jpg", + "similar_books": ["Becoming", + "Educated", + "Born a Crime: Stories From a South African Childhood", + "More Than Love, A Husband's Tale", + "I Am Malala: The Story of the Girl Who Stood Up for Education and Was Shot by the Taliban", + "Untamed", + "Where the Crawdads Sing", + "A Promised Land", + "Dreams from My Father: A Story of Race and Inheritance", + "Drum Beats, Heart Beats (Dancing Soul Trilogy, #3)", + "Eat, Pray, Love", + "Losing Jon: A Teen's Tragic Death, a Police Cover-Up, a Community's Fight for Justice", + "I Know Why the Caged Bird Sings (Maya Angelou's Autobiography, #1)", + "The Vanishing Half", + "Self Made: Inspired by the Life of Madam C.J. Walker", + "Bossypants", + "The Audacity of Hope: Thoughts on Reclaiming the American Dream", + "Such a Fun Age", + "Between the World and Me", + "Black Fortunes: The Story of the First Six African Americans Who Escaped Slavery and Became Millionaires", + "Little Fires Everywhere"]} +} \ No newline at end of file -- GitLab