diff --git a/API/Functions.py b/API/Functions.py
index 96e48813432eadf140a939ce588ca9274da4f28b..452887c28cb692c6b2c5aaa99e5781ca7be73819 100644
--- a/API/Functions.py
+++ b/API/Functions.py
@@ -1,21 +1,49 @@
 import requests
 import os
-import RegularExpressionParser.Parser as parser
-import re
+import RegularExpressionParser.Parser as Parser
 from dotenv import load_dotenv
 
 
+def safe_url(query):
+    load_dotenv()
+    host = os.getenv('SERVER_HOST')
+    return host + Parser.url_safe(query)
+
+
 def get(query):
     """
     function used to send get request to find one or several documentations
     :param query: user input query
     :return: json file of the given author or book
     """
-    load_dotenv()
-    host = os.getenv('SERVER_HOST')
-    url = host + parser.url_safe(query)
+    print("GET: " + query)
+    url = safe_url(query)
     req = requests.get(url)
+    print("response code: " + str(req.status_code))
     return req.json()
 
 
+def put(query, json_file):
+    print("PUT: " + query)
+    url = safe_url(query)
+    req = requests.put(url, json=json_file)
+    print("response code: " + str(req.status_code))
+    return req.text
+
+
+def post(query, json_file):
+    print("POST: " + query)
+    url = safe_url(query)
+    req = requests.post(url, json=json_file)
+    print("response code: " + str(req.status_code))
+    return req.text
+
+
+def delete(query):
+    print("DELETE: " + query)
+    url = safe_url(query)
+    req = requests.delete(url)
+    print("response code: " + str(req.status_code))
+    return req.text
+
diff --git a/API/__init__.py b/API/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
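For reference, a minimal sketch of how these wrappers might be driven from Python once the server is running. SERVER_HOST is read from .env inside safe_url(); the query string and update payload below are only illustrative placeholders, since the exact query grammar is defined by RegularExpressionParser.Parser and SimpleServer.py:

import API.Functions as API

query = "book?id=3735293"                        # hypothetical query string
print(API.get(query))                            # GET    -> parsed JSON
print(API.put(query, {"title": "Clean Code"}))   # PUT    -> response text
print(API.delete(query))                         # DELETE -> response text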
diff --git a/Client/CLI.py b/Client/CLI.py
index b3c40d64a98850710c80d8b0671134ae417af5f6..810418314e586eb994cc56495b4c6b5205736f45 100644
--- a/Client/CLI.py
+++ b/Client/CLI.py
@@ -1,21 +1,186 @@
 import typer
+import RegularExpressionParser.Parser as Parser
+import Crawler.Main as Main
+import API.Functions as API
+import os
+import json
+from dotenv import load_dotenv
 
 app = typer.Typer()
 
+
 @app.command()
-def scrape_book(url: str):
-    """ Function used to create scrape request to Server """
+def main_page():
+    print("Welcome to the goodReads scraper!\n"
+          "This is the main page\n"
+          "Usage:\n"
+          "scrape: scrape goodReads and load data into the database\n"
+          "get: find a document for a book or an author in the database\n"
+          "put: create or update a document in the database\n"
+          "post: add one or multiple book or author documents to the database\n"
+          "delete: delete a certain document from the database\n"
+          "export: export a certain collection from the database\n"
+          "exit: end the program")
+    order = input()
+    if order == "scrape":
+        scrape()
+    elif order == "get":
+        get()
+    elif order == "put":
+        put()
+    elif order == "post":
+        post()
+    elif order == "delete":
+        delete()
+    elif order == "export":
+        export()
+    elif order == "exit":
+        exit()
+    else:
+        print("Unknown command")
+        main_page()
+def scrape():
+    """
+    Function used to create a scrape request to the Server
+    Only for assignment 2.0; for GET scrape, see the GET branch in SimpleServer.py
+    """
+    print('please enter the URL of the book')
+    url = input()
+    print('please enter the number of books you want to crawl')
+    book_num = input()
+    if not book_num.isdigit():
+        print("max book num must be an integer")
+        main_page()
+    book_num = int(book_num)
+    print('please enter the number of authors you want to crawl')
+    author_num = input()
+    if not author_num.isdigit():
+        print("max author num must be an integer")
+        main_page()
+    author_num = int(author_num)
+    print('Request received, start scraping')
+    if not Parser.check_if_address_valid(url):
+        print("Error: invalid URL")
+        main_page()
+    elif book_num >= 2000 or author_num >= 2000:
+        print("Error: the numbers of books and authors should be lower than 2000")
+        main_page()
+    else:
+        if book_num > 200 or author_num > 50:
+            print("Warning: the requested number of books or authors is large")
+        Main.scrape_api(url, book_num, author_num)
+        main_page()
 
 
-@app.command()
-def goodbye(name: str, formal: bool = False):
-    """ function2 """
-    if formal:
-        typer.echo(f"Goodbye Ms. {name}. Have a good day.")
+
+def get():
+    print("Please enter your query:")
+    query = input()
+    result = API.get(query)
+    typer.echo(result)
+    main_page()
+
+
+def put():
+    print("Please enter your query:")
+    query = input()
+    print("Please select a way to load the update info: input or read:")
+    load = input()
+    if load == "input":
+        print("Please enter JSON data:")
+        string = input()
+        try:
+            data = json.loads(string)
+        except ValueError as err:
+            print(err)
+            main_page()
+    elif load == "read":
+        load_dotenv()
+        file_root = os.getenv('FILE_ROOT')
+        print("Please enter the name of a file in the file root directory:")
+        file_path = r"" + file_root + input()
+        if os.path.isfile(file_path):
+            with open(file_path, "r") as file:
+                try:
+                    data = json.load(file)
+                except ValueError as err:
+                    print(err)
+                    main_page()
+        else:
+            print("Error: file does not exist")
+            main_page()
     else:
-        typer.echo(f"Bye {name}!")
+        print("Unknown command")
+        main_page()
+    result = API.put(query, data)
+    typer.echo(result)
+    main_page()
+
+
+def post():
+    print("Please enter your query:")
+    query = input()
+    load_dotenv()
+    file_root = os.getenv('FILE_ROOT')
+    print("Please enter the name of a file in the file root directory:")
+    file_path = r"" + file_root + input()
+    if os.path.isfile(file_path):
+        with open(file_path, "r") as file:
+            try:
+                data = json.load(file)
+            except ValueError as err:
+                print(err)
+                main_page()
+    else:
+        print("Error: file does not exist")
+        main_page()
+    result = API.post(query, data)
+    typer.echo(result)
+    main_page()
+
+
+def delete():
+    print("Please enter your query:")
+    query = input()
+    result = API.delete(query)
+    typer.echo(result)
+    main_page()
+
+
+def export():
+    print("Which collection do you want to export? books or authors?")
+    collection = input()
+    if collection == "books":
+        json_file = json.dumps(API.get("book"))
+        load_dotenv()
+        file_root = os.getenv('FILE_ROOT')
+        with open(file_root + "books" + ".json", "w") as output:
+            output.write(json_file)
+        print("books.json is stored at " + file_root)
+        main_page()
+    elif collection == "authors":
+        json_file = json.dumps(API.get("author"))
+        load_dotenv()
+        file_root = os.getenv('FILE_ROOT')
+        with open(file_root + "authors" + ".json", "w") as output:
+            output.write(json_file)
+        print("authors.json is stored at " + file_root)
+        main_page()
+    elif collection == "back":
+        main_page()
+    else:
+        print("Unknown collection")
+        main_page()
+
+
+# def goodbye(name: str, formal: bool = False):
+#     """ function2 """
+#     if formal:
+#         typer.echo(f"Goodbye Ms. {name}. Have a good day.")
+#     else:
+#         typer.echo(f"Bye {name}!")
 
 
 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
diff --git a/Client/__init__.py b/Client/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
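The CLI, the API wrappers, and the database export all read their configuration from a .env file via python-dotenv. A minimal sketch with the variable names used in this patch — the values are placeholders only, not part of the change:

SERVER_HOST=http://127.0.0.1:5000/api/
FILE_ROOT=/tmp/goodreads_files/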
Have a good day.") +# else: +# typer.echo(f"Bye {name}!") if __name__ == "__main__": - app() \ No newline at end of file + app() diff --git a/Client/__init__.py b/Client/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Crawler/Main.py b/Crawler/Main.py index 24a7e96d34f7d56cc387481802070ad3b71ff8e4..c778b1c285e4e64d357667f754999c8cfcf6866c 100644 --- a/Crawler/Main.py +++ b/Crawler/Main.py @@ -1,18 +1,50 @@ import Crawler.Spider as Spider import time +import RegularExpressionParser.Parser as Parser from DataBase import mongoDB as db +def scrape_api(url_api, book_num_api, author_num_api): + print('Request received, start scraping') + book_dict_api = {} + author_dict_api = {} + similar_book_dict_api = {} + count_api = 1 + + print("Crawling book #" + str(count_api)) + start_page_api = Spider.Spider(url_api) + start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + while len(book_dict_api) < int(book_num_api) and len(author_dict_api) < int(author_num_api): + all_related_books_api = list(similar_book_dict_api.items()) + for book_url_pair_api in all_related_books_api: + if len(book_dict_api) >= int(book_num_api) or len(author_dict_api) >= int(author_num_api): + break + book_name_api = book_url_pair_api[0] + if book_name_api in book_dict_api: + pass + else: + count_api += 1 + print("Crawling book #" + str(count_api)) + book_url_api = book_url_pair_api[1] + book_spider_api = Spider.Spider(book_url_api) + book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + # since the network delay from asia to goodread.com is already very high, + # a small amount of time of sleep is used here. + time.sleep(0.2) + + db.insert_dicts(book_dict_api, 0) + db.insert_dicts(author_dict_api, 1) + print("Scraping completed") + + print('please enter URL of the book') url = input() -# url = https://www.goodreads.com/book/show/3735293-clean-code -# https://www.goodreads.com/book/show/35133922-educated print('please enter the number of books you want to crawl') book_num = input() -# bookNum = 3 print('please enter the number of authors you want to crawl') author_num = input() -# authorNum = 2 print('Request received, start scraping') book_dict = {} @@ -27,17 +59,17 @@ start_page.crawl(book_dict, author_dict, similar_book_dict) while len(book_dict) < int(book_num) or len(author_dict) < int(author_num): all_related_books = list(similar_book_dict.items()) - for bookURLPair in all_related_books: + for book_url_pair in all_related_books: if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num): break - book_name = bookURLPair[0] + book_name = book_url_pair[0] if book_name in book_dict: pass else: count += 1 print("Crawling book #" + str(count)) - bookURL = bookURLPair[1] - book_spider = Spider.Spider(bookURL) + book_url = book_url_pair[1] + book_spider = Spider.Spider(book_url) book_spider.crawl(book_dict, author_dict, similar_book_dict) # since the network delay from asia to goodread.com is already very high, diff --git a/Crawler/__init__.py b/Crawler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/DataBase/__init__.py b/DataBase/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py index c4e2222b6f663a70793886e2bb216541ce70370f..4095fa3a3c8fb662358b3c2a0a2d8d85fd9b7799 
diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py
index 27f4fe6f0d14f2a665d1ae7fc13c45d13082e6e1..fc6f628379503e62f14723758ca0fab49edd61f7 100644
--- a/RegularExpressionParser/Parser.py
+++ b/RegularExpressionParser/Parser.py
@@ -49,29 +49,35 @@ def parse_query_to_url(query):
 
 
 def parse_query_to_json(pair):
-    elements = re.findall("[0-9A-Za-z\"._]", pair)
+    print(pair)
+    elements = re.findall("[0-9A-Za-z\"_.]+", pair)
     count = len(elements)
-    print(count)
-    if count != 3 and count != 4:
+    if count != 2:
         print("Failed to parse query: invalid args number")
         return {"wrong": "True"} # will never be founded in database
-    elif count == 3:
-        # can be A.B: C or A.B: "C"
-        if re.search("\".*\"", elements[2]):
-            return {elements[0] + "." + elements[1]: elements[2]}
-        else:
-            return {elements[0] + "." + elements[1]: {"$regex": elements[2], "$options": "i"}}
     else:
-        # can be NOT, >, or <
-        if elements[2] == "NOT":
-            return {elements[0] + "." + elements[1]: {"$not": {"$regex": elements[3], "$options": "i"}}}
-        elif elements[2] == ">":
-            return {elements[0] + "." + elements[1]: {"$gt": float(elements[3])}}
-        elif elements[2] == "<":
-            return {elements[0] + "." + elements[1]: {"$lt": float(elements[3])}}
+        # can be A.B: C or A.B: "C"
+        if re.search(":", pair):
+            if re.search("^[0-9.]*$", elements[1]):
+                return {elements[0].split(".")[1]: float(elements[1])}
+            else:
+                if re.search("\".*\"", elements[1]):
+                    return {elements[0].split(".")[1]: elements[1]}
+                else:
+                    return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}}
         else:
-            print("Failed to parse query: unknown operator")
-            return {"wrong": "True"}
+            if re.search("NOT", pair):
+                if re.search("^[0-9.]*$", elements[1]):
+                    return {elements[0].split(".")[1]: {"$ne": float(elements[1])}}
+                else:
+                    return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}}
+            elif re.search(">", pair):
+                return {elements[0].split(".")[1]: {"$gt": float(elements[1])}}
+            elif re.search("<", pair):
+                return {elements[0].split(".")[1]: {"$lt": float(elements[1])}}
+            else:
+                print("Failed to parse query: unknown operator")
+                return {"wrong": "True"}
 
 
 def url_safe(element):
diff --git a/RegularExpressionParser/__init__.py b/RegularExpressionParser/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
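For reference, a few translations the reworked parse_query_to_json should produce, traced from the code above. The query grammar itself is not documented in this patch, so the inputs are assumed examples:

import RegularExpressionParser.Parser as Parser

Parser.parse_query_to_json('book.id:3735293')
# -> {'id': 3735293.0}                                   numeric values become floats
Parser.parse_query_to_json('book.title: Educated')
# -> {'title': {'$regex': 'Educated', '$options': 'i'}}  bare words become case-insensitive regex filters
Parser.parse_query_to_json('book.title: "Educated"')
# -> {'title': '"Educated"'}                             quoted values are matched literally (quotes kept)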
+ elements[1]: {"$lt": float(elements[3])}} + # can be A.B: C or A.B: "C" + if re.search(":", pair): + if re.search("^[0-9.]*$", elements[1]): + return {elements[0].split(".")[1]: float(elements[1])} + else: + if re.search("\".*\"", elements[1]): + return {elements[0].split(".")[1]: elements[1]} + else: + return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}} else: - print("Failed to parse query: unknown operator") - return {"wrong": "True"} + if re.search("NOT", pair): + if re.search("^[0-9.]*$", elements[1]): + return {elements[0].split(".")[1]: {"$ne": float(elements[1])}} + else: + return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}} + elif re.search(">", pair): + return {elements[0].split(".")[1]: {"$gt": float(elements[1])}} + elif re.search("<", pair): + return {elements[0].split(".")[1]: {"$lt": float(elements[1])}} + else: + print("Failed to parse query: unknown operator") + return {"wrong": "True"} def url_safe(element): diff --git a/RegularExpressionParser/__init__.py b/RegularExpressionParser/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py index c416ace4e9167991e8313b349a93d4f9f13b0cec..ae10ffbcd06ffe8a58c0750d7d4152f6f7615eb0 100644 --- a/Server/SimpleServer.py +++ b/Server/SimpleServer.py @@ -1,12 +1,12 @@ -import RegularExpressionParser.Parser as parser -import DataBase.mongoDB as db +import DataBase.mongoDB as DataBase +import RegularExpressionParser.Parser as Parser import re from flask import Flask from flask import request from urllib.parse import urlparse, parse_qs app = Flask(__name__) -app.config["DEBUG"] = True +# app.config["DEBUG"] = True @app.route("/", methods=['GET']) @@ -15,32 +15,79 @@ def home(): @app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"]) -def data(collection): +def data_base(collection): if request.method == "GET": - if collection == "books" or collection == "book": - identifier = request.args.to_dict() - print(identifier) - print(type(identifier)) - return "200: find" - elif collection == "authors" or collection == "author": - print(request.url) - return "200: find" + if collection == "book": + url_parsed = urlparse(request.url) + qs_parsed = parse_qs(url_parsed.query) + # print(qs_parsed["id"]) + if qs_parsed == {}: + return DataBase.get_documents_json(0, {}) + return search_document(["book.id:" + qs_parsed["id"][0]]) + elif collection == "author": + url_parsed = urlparse(request.url) + qs_parsed = parse_qs(url_parsed.query) + # print(type(qs_parsed["id"])) + if qs_parsed == {}: + return DataBase.get_documents_json(1, {}) + return search_document(["author.id:" + qs_parsed["id"][0]]) elif collection == "search": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) + print(qs_parsed) return search_document(qs_parsed["q"][0].split("&")) else: - return "404 not found" - # elif request.method == "PUT": + return "404: Unknown Collection to GET" + elif request.method == "PUT": + if request.headers["Content-Type"] != "application/json": + return "415: content should be JSON file" + json_update_info = request.json + if collection == "book": + opt = 0 + elif collection == "author": + opt = 1 + else: + return "404: Unknown Collection to GET" + DataBase.update_dicts(opt, request.args.to_dict(), json_update_info) + return "200: PUT succeeded" + elif request.method == "POST": + if request.headers["Content-Type"] != 
"application/json": + return "415: content should be JSON file" + json_file = request.json + if collection == "books": + DataBase.insert_dicts(json_file, 0) + elif collection == "authors": + DataBase.insert_dicts(json_file, 1) + elif collection == "book": + DataBase.insert_document(json_file, 0) + elif collection == "author": + DataBase.insert_document(json_file, 1) + elif collection == "scrape": + return "200" + else: + return "404: Unknown Collection to POST" + return "201: POST succeeded" + elif request.method == "DELETE": + identifier = request.args.to_dict() + print(identifier) + if collection == "book": + opt = 0 + elif collection == "author": + opt = 1 + else: + return "404: Unknown Collection to DELETE" + DataBase.clean(opt, identifier) + return "200: DELETE succeeded" def search_document(identifiers): if len(identifiers) == 1: - json_idt = parser.parse_query_to_json(identifiers[0]) + json_idt = Parser.parse_query_to_json(identifiers[0]) + print(json_idt) if re.search("^book.*", identifiers[0]): - return db.get_documents_json(0, json_idt) + return DataBase.get_documents_json(0, json_idt) else: - return db.get_documents_json(1, json_idt) + return DataBase.get_documents_json(1, json_idt) elif len(identifiers) == 3: if re.search("^book.*", identifiers[0]): if re.search("^author.*", identifiers[2]): @@ -54,16 +101,18 @@ def search_document(identifiers): return {} else: opt = 1 - json_idt1 = parser.parse_query_to_json(identifiers[0]) - json_idt2 = parser.parse_query_to_json(identifiers[2]) + json_idt1 = Parser.parse_query_to_json(identifiers[0]) + json_idt2 = Parser.parse_query_to_json(identifiers[2]) if identifiers[1] == "AND": exp = {"$and": [json_idt1, json_idt2]} elif identifiers[1] == "OR": - exp = {"$or": [json_idt1,json_idt2]} + exp = {"$or": [json_idt1, json_idt2]} else: print("Failed to parse query: unknown operator for identifiers[1]") return {} - return db.get_documents_json(opt, exp) + print("exp:") + print(exp) + return DataBase.get_documents_json(opt, exp) else: return "Error, unknown identifiers" diff --git a/Server/__init__.py b/Server/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391