diff --git a/API/Functions.py b/API/Functions.py index 452887c28cb692c6b2c5aaa99e5781ca7be73819..4858e26a571deb8d84f81efbeb0aeaebae4732ca 100644 --- a/API/Functions.py +++ b/API/Functions.py @@ -19,7 +19,7 @@ def get(query): print("GET: " + query) url = safe_url(query) req = requests.get(url) - print("response code: " + str(req.status_code)) + print("\nresponse code: " + str(req.status_code) + "\n") return req.json() @@ -27,15 +27,15 @@ def put(query, json_file): print("PUT: " + query) url = safe_url(query) req = requests.put(url, json=json_file) - print("response code: " + str(req.status_code)) + print("\nresponse code: " + str(req.status_code) + "\n") return req.text def post(query, json_file): print("POST: " + query) url = safe_url(query) - req = requests.put(url, json=json_file) - print("response code: " + str(req.status_code)) + req = requests.post(url, json=json_file) + print("\nresponse code: " + str(req.status_code) + "\n") return req.text @@ -43,7 +43,7 @@ def delete(query): print("DELETE: " + query) url = safe_url(query) req = requests.delete(url) - print("response code: " + str(req.status_code)) + print("\nresponse code: " + str(req.status_code) + "\n") return req.text diff --git a/Client/CLI.py b/Client/CLI.py index 810418314e586eb994cc56495b4c6b5205736f45..3bb87ef27a489a896e724d6c60fc0d2cb28976f7 100644 --- a/Client/CLI.py +++ b/Client/CLI.py @@ -1,9 +1,10 @@ import typer import RegularExpressionParser.Parser as Parser -import Crawler.Main as Main +import Crawler.Scrape as Scrape import API.Functions as API import os import json +import re from dotenv import load_dotenv app = typer.Typer() @@ -11,20 +12,24 @@ app = typer.Typer() @app.command() def main_page(): - print("Welcome to use goodReads scraper!\n" - "This is main page\n" - "Usage:\n" - "scrape: scrape goodRead and load data to database\n" - "get: find a document for a book or an author from database\n" - "put: create or update a document in database\n" - "post: add one or multiple documents of book or author in database \n" - "delete: delete certain document from database\n" - "export: export a certain collection from database\n" - "exit: end the program") + """ + main page of client, directly run this .py to enter and see usage + :return: none + """ + print("==================================================================\n\n" + "Welcome to use goodReads scraper!\n\n" + "This is main page\n\n" + "You can input back to return to this page at anytime\n\n" + "Usage:\n\n" + "get: find a document for a book or an author from database\n\n" + "put: create or update a document in database\n\n" + "post: add one or multiple documents of book or author in database \n\n" + "delete: delete certain document from database\n\n" + "export: export a certain collection from database\n\n" + "exit: end the program\n\n" + "Please enter your command:\n") order = input() - if order == "scrape": - scrape() - elif order == "get": + if order == "get": get() elif order == "put": put() @@ -34,152 +39,186 @@ def main_page(): delete() elif order == "export": export() - elif order == "exit": + if order == "exit": exit() else: print("Unknown command") main_page() -def scrape(): +def check_and_back(command): + """ check if used in put back and return to main page if so """ + if command == "back": + main_page() + + +def back_to_main_page(): + """ wait for user to press enter and then return to the main page """ + print("press enter to return to main page") + no_use = input() + main_page() + + +def scrape(query): """ Function used to create scrape 
request to Server - Only for assignment 2.0, for GET scrape, find it in GET in SimplerServer.py """ - print('please enter URL of the book') - url = input() - print('please enter the number of books you want to crawl') - book_num = input() + attributes = query.split("?")[1].split("&") + print(attributes) + url = attributes[0].split("=", 1)[1] + book_num = attributes[1].split("=")[1] if not book_num.isdigit(): print("max book num must be an integer") - main_page() + back_to_main_page() book_num = int(book_num) - print('please enter the number of authors you want to crawl') - author_num = input() + author_num = attributes[2].split("=")[1] if not author_num.isdigit(): print("max author num must be an integer") - main_page() + back_to_main_page() author_num = int(author_num) print('Request received, start scraping') if not Parser.check_if_address_valid(url): print("Error: invalid URL") - main_page() - elif book_num >= 2000 or author_num >= 2000: - print("Error, the number of book and author should be lower than 2000") - main_page() + back_to_main_page() + elif book_num >= 2000 or author_num >= 500 or book_num <= 0 or author_num <= 0: + print("Error, the number of book and author should be positive and lower than 2000/500") + back_to_main_page() else: if book_num > 200 or author_num > 50: print("Warning, maximum of the number of books or authors is large") - Main.scrape_api(url, book_num, author_num) - main_page() + result = API.post(query, {}) + return result def get(): - print("Please enter your query:") + """ GET API """ + print("==================================================================\n") + print("Please enter your query:\n") query = input() + check_and_back(query) result = API.get(query) typer.echo(result) - main_page() + print() + back_to_main_page() def put(): - print("Please enter your query:") + """ PUT API """ + print("==================================================================\n") + print("Please enter your query:\n") query = input() - print("Please select one way to load update info: input or read:") + check_and_back(query) + print("Please select one way to load update info: input or read:\n") load = input() + check_and_back(load) if load == "input": - print("Please enter json data:") + print("Please enter json data:\n") string = input() + check_and_back(string) try: data = json.loads(string) except ValueError as err: print(err) - main_page() + back_to_main_page() elif load == "read": load_dotenv() file_root = os.getenv('FILE_ROOT') - print("Please enter the name of file which is in the file root directory:") - file_path = r"" + file_root + input() + print("Please enter the name of file which is in the file root directory:\n") + name = input() + check_and_back(name) + file_path = r"" + file_root + name if os.path.isfile(file_path): with open(file_path, "r") as file: try: data = json.load(file) except ValueError as err: print(err) - main_page() + back_to_main_page() else: print("Error: file not exist") - main_page() + back_to_main_page() else: print("Unknown command") - main_page() + back_to_main_page() result = API.put(query, data) typer.echo(result) - main_page() + print() + back_to_main_page() def post(): - print("Please enter your query:") + """ POST API """ + print("==================================================================\n") + print("Please enter your query:\n") query = input() - load_dotenv() - file_root = os.getenv('FILE_ROOT') - print("Please enter the name of file which is in the file root directory:") - file_path = r"" + file_root + input() - if 
os.path.isfile(file_path): - with open(file_path, "r") as file: - try: - data = json.load(file) - except ValueError as err: - print(err) - main_page() + check_and_back(query) + if re.search("scrape", query): + result = scrape(query) + typer.echo(result) + print() + back_to_main_page() else: - print("Error: file not exist") - main_page() - result = API.post(query, data) - typer.echo(result) - main_page() + load_dotenv() + file_root = os.getenv('FILE_ROOT') + print("Please enter the name of file which is in the file root directory:\n") + name = input() + check_and_back(name) + file_path = r"" + file_root + name + if os.path.isfile(file_path): + with open(file_path, "r") as file: + try: + data = json.load(file) + except ValueError as err: + print(err) + back_to_main_page() + else: + print("Error: file not exist") + back_to_main_page() + result = API.post(query, data) + typer.echo(result) + print() + back_to_main_page() def delete(): - print("Please enter your query:") + """ DELETE API """ + print("==================================================================\n") + print("Please enter your query:\n") query = input() + check_and_back(query) result = API.delete(query) typer.echo(result) - main_page() + print() + back_to_main_page() def export(): - print("Which collection do you want to export? books or authors?") + """ export the book or author collection """ + print("==================================================================\n") + print("Which collection do you want to export: books or authors?\n") collection = input() + check_and_back(collection) if collection == "books": - json = API.get("book") + json_file = API.get("book") load_dotenv() file_root = os.getenv('FILE_ROOT') with open(file_root + "books" + ".json", "w") as output: - output.write(json) + output.write(json_file) print("books.json is stored at " + file_root) - main_page() + back_to_main_page() elif collection == "authors": - json = API.get("author") + json_file = API.get("author") load_dotenv() file_root = os.getenv('FILE_ROOT') with open(file_root + "authors" + ".json", "w") as output: - output.write(json) + output.write(json_file) print("authors.json is stored at " + file_root) - main_page() + back_to_main_page() elif collection == "back": - main_page() + back_to_main_page() else: print("Unknown collection") - main_page() - - -# def goodbye(name: str, formal: bool = False): -# """ function2 """ -# if formal: -# typer.echo(f"Goodbye Ms. {name}. 
Have a good day.") -# else: -# typer.echo(f"Bye {name}!") + back_to_main_page() if __name__ == "__main__": diff --git a/Crawler/Main.py b/Crawler/Main.py deleted file mode 100644 index c778b1c285e4e64d357667f754999c8cfcf6866c..0000000000000000000000000000000000000000 --- a/Crawler/Main.py +++ /dev/null @@ -1,81 +0,0 @@ -import Crawler.Spider as Spider -import time -import RegularExpressionParser.Parser as Parser -from DataBase import mongoDB as db - - -def scrape_api(url_api, book_num_api, author_num_api): - print('Request received, start scraping') - book_dict_api = {} - author_dict_api = {} - similar_book_dict_api = {} - count_api = 1 - - print("Crawling book #" + str(count_api)) - start_page_api = Spider.Spider(url_api) - start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) - - while len(book_dict_api) < int(book_num_api) and len(author_dict_api) < int(author_num_api): - all_related_books_api = list(similar_book_dict_api.items()) - for book_url_pair_api in all_related_books_api: - if len(book_dict_api) >= int(book_num_api) or len(author_dict_api) >= int(author_num_api): - break - book_name_api = book_url_pair_api[0] - if book_name_api in book_dict_api: - pass - else: - count_api += 1 - print("Crawling book #" + str(count_api)) - book_url_api = book_url_pair_api[1] - book_spider_api = Spider.Spider(book_url_api) - book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) - - # since the network delay from asia to goodread.com is already very high, - # a small amount of time of sleep is used here. - time.sleep(0.2) - - db.insert_dicts(book_dict_api, 0) - db.insert_dicts(author_dict_api, 1) - print("Scraping completed") - - -print('please enter URL of the book') -url = input() -print('please enter the number of books you want to crawl') -book_num = input() -print('please enter the number of authors you want to crawl') -author_num = input() -print('Request received, start scraping') - -book_dict = {} -author_dict = {} -similar_book_dict = {} -count = 1 - -print("Crawling book #" + str(count)) -start_page = Spider.Spider(url) -start_page.crawl(book_dict, author_dict, similar_book_dict) - - -while len(book_dict) < int(book_num) or len(author_dict) < int(author_num): - all_related_books = list(similar_book_dict.items()) - for book_url_pair in all_related_books: - if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num): - break - book_name = book_url_pair[0] - if book_name in book_dict: - pass - else: - count += 1 - print("Crawling book #" + str(count)) - book_url = book_url_pair[1] - book_spider = Spider.Spider(book_url) - book_spider.crawl(book_dict, author_dict, similar_book_dict) - - # since the network delay from asia to goodread.com is already very high, - # a small amount of time of sleep is used here. 
- time.sleep(0.2) - -db.insert_dicts(book_dict, 0) -db.insert_dicts(author_dict, 1) -print("Scraping completed") diff --git a/Crawler/Scrape.py b/Crawler/Scrape.py new file mode 100644 index 0000000000000000000000000000000000000000..a2f97aefca3dde45cec48686887942fcfbc52771 --- /dev/null +++ b/Crawler/Scrape.py @@ -0,0 +1,38 @@ +import Crawler.Spider as Spider +import time +from DataBase import mongoDB as db + + +def scrape_api(url_api, book_num_api, author_num_api): + print('Request received, start scraping') + book_dict_api = {} + author_dict_api = {} + similar_book_dict_api = {} + count_api = 1 + + print("Crawling book #" + str(count_api)) + start_page_api = Spider.Spider(url_api) + start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + while len(book_dict_api) < int(book_num_api) or len(author_dict_api) < int(author_num_api): + all_related_books_api = list(similar_book_dict_api.items()) + for book_url_pair_api in all_related_books_api: + if len(book_dict_api) >= int(book_num_api) and len(author_dict_api) >= int(author_num_api): + break + book_name_api = book_url_pair_api[0] + if book_name_api in book_dict_api: + pass + else: + count_api += 1 + print("Crawling book #" + str(count_api)) + book_url_api = book_url_pair_api[1] + book_spider_api = Spider.Spider(book_url_api) + book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + # since the network delay from asia to goodread.com is already very high, + # a small amount of time of sleep is used here. + time.sleep(0.2) + + db.insert_dicts(book_dict_api, 0) + db.insert_dicts(author_dict_api, 1) + print("Scraping completed") \ No newline at end of file diff --git a/Crawler/Spider.py b/Crawler/Spider.py index a689789499bb30892b56c829d7c64ed6b19dcf78..bab2576aa30434e43cb80bbca36d56e4d29c8388 100644 --- a/Crawler/Spider.py +++ b/Crawler/Spider.py @@ -11,12 +11,21 @@ class Spider: self.soup = None def crawl(self, book_dict, author_dict, similar_book_dict): + """ + function used to scrape data of a certain book + :param book_dict: + :param author_dict: + :param similar_book_dict: + :return: none + """ header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"} url = self.url res = requests.get(url, headers=header) self.soup = BeautifulSoup(res.text, "lxml") + + # parsing data of current book titleAndAuthor = self.soup.title.string.split(" by ") title = titleAndAuthor[0] bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0] @@ -36,6 +45,8 @@ class Spider: authorID = (author_page_link.split("show/")[1]).split(".")[0] see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"] res_similar = requests.get(see_more_link, headers=header) + + # parsing similar books of current book similar_soup = BeautifulSoup(res_similar.text, 'lxml') similar_books_and_authors = similar_soup.find_all("span", itemprop="name") similar_books = [] @@ -51,6 +62,8 @@ class Spider: bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count, review_count, imageURL, similar_books) book_dict[title] = bookPKT + + # parse data of the author of current book res_author = requests.get(author_page_link, headers=header) author_soup = BeautifulSoup(res_author.text, 'lxml') author_rating = float(author_soup.find("span", {"class": "average"}).text) diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py index 
fc6f628379503e62f14723758ca0fab49edd61f7..6743910e05af0ad8e3418b9b462d001e8d841ad1 100644 --- a/RegularExpressionParser/Parser.py +++ b/RegularExpressionParser/Parser.py @@ -58,7 +58,7 @@ def parse_query_to_json(pair): else: # can be A.B: C or A.B: "C" if re.search(":", pair): - if re.search("^[0-9.]*$", elements[1]): + if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]): return {elements[0].split(".")[1]: float(elements[1])} else: if re.search("\".*\"", elements[1]): @@ -67,7 +67,7 @@ def parse_query_to_json(pair): return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}} else: if re.search("NOT", pair): - if re.search("^[0-9.]*$", elements[1]): + if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]): return {elements[0].split(".")[1]: {"$ne": float(elements[1])}} else: return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}} diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py index ae10ffbcd06ffe8a58c0750d7d4152f6f7615eb0..8f341090932a4c875d490ecc5cc26925033a9a2a 100644 --- a/Server/SimpleServer.py +++ b/Server/SimpleServer.py @@ -1,6 +1,7 @@ import DataBase.mongoDB as DataBase import RegularExpressionParser.Parser as Parser import re +import Crawler.Scrape as Scrape from flask import Flask from flask import request from urllib.parse import urlparse, parse_qs @@ -20,21 +21,18 @@ def data_base(collection): if collection == "book": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) - # print(qs_parsed["id"]) if qs_parsed == {}: return DataBase.get_documents_json(0, {}) return search_document(["book.id:" + qs_parsed["id"][0]]) elif collection == "author": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) - # print(type(qs_parsed["id"])) if qs_parsed == {}: return DataBase.get_documents_json(1, {}) return search_document(["author.id:" + qs_parsed["id"][0]]) elif collection == "search": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) - print(qs_parsed) return search_document(qs_parsed["q"][0].split("&")) else: return "404: Unknown Collection to GET" @@ -47,7 +45,7 @@ def data_base(collection): elif collection == "author": opt = 1 else: - return "404: Unknown Collection to GET" + return "404: Unknown Collection to PUT" DataBase.update_dicts(opt, request.args.to_dict(), json_update_info) return "200: PUT succeeded" elif request.method == "POST": @@ -63,7 +61,15 @@ def data_base(collection): elif collection == "author": DataBase.insert_document(json_file, 1) elif collection == "scrape": - return "200" + # url_parsed = urlparse(request.url) + # qs_parsed = parse_qs(url_parsed.query) + # print(qs_parsed) + param = request.args.to_dict() + url = param["url"] + max_book = param["max_book"] + max_author = param["max_author"] + Scrape.scrape_api(url, max_book, max_author) + return "201: new data has been added to database" else: return "404: Unknown Collection to POST" return "201: POST succeeded" @@ -81,6 +87,7 @@ def data_base(collection): def search_document(identifiers): + """ function used to find one or several document in database """ if len(identifiers) == 1: json_idt = Parser.parse_query_to_json(identifiers[0]) print(json_idt) @@ -96,7 +103,7 @@ def search_document(identifiers): else: opt = 0 else: - if re.search("^book.*",identifiers[2]): + if re.search("^book.*", identifiers[2]): print("Failed to find documentation: two statements are not pointing to the same collection") return {} else:
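
Note: the two RegularExpressionParser/Parser.py tweaks keep id-like fields out of the float conversion, so numeric-looking ids are matched as strings rather than numbers. A rough illustration of the intended behaviour, assuming parse_query_to_json receives pairs in the "collection.field:value" form that search_document builds and splits them into ("collection.field", "value"); the expected results below are inferred from the hunks above, not captured output.

import RegularExpressionParser.Parser as Parser

# Plain numeric values are still converted to float for exact matching.
Parser.parse_query_to_json("book.rating:4.5")
# -> {"rating": 4.5}

# Fields whose name contains "id" now skip the float branch and fall through
# to the case-insensitive regex match, so ids stay string-typed.
Parser.parse_query_to_json("book.id:3735293")
# -> {"id": {"$regex": "3735293", "$options": "i"}}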
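
Note: below is a minimal, hedged sketch of how a client could exercise the new POST "scrape" branch in Server/SimpleServer.py. The base address and route prefix are assumptions (the @app.route decorator sits outside this diff), and the Goodreads URL is only a placeholder; the parameter names url, max_book and max_author and the "201: new data has been added to database" response text come from the handler shown above.

import requests

# Assumed base address -- adjust to wherever SimpleServer.py actually mounts
# the <collection> routes; only the handler body is visible in this diff.
BASE = "http://127.0.0.1:5000/api"

params = {
    "url": "https://www.goodreads.com/book/show/3735293-clean-code",  # placeholder start page
    "max_book": 5,    # the CLI requires a positive integer below 2000
    "max_author": 2,  # the CLI requires a positive integer below 500
}

# The handler reads url/max_book/max_author from the query string and calls
# Scrape.scrape_api(...), so the JSON body can stay empty.
response = requests.post(BASE + "/scrape", params=params, json={})
print(response.status_code, response.text)  # expect "201: new data has been added to database"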