Commit a3565f17 authored by zshan2

assignment-2.1 version3: API all deployed, CLI initially finished

parent 8959e6be
Merge request !1: assignment-2.1
import requests
import os
import RegularExpressionParser.Parser as Parser
from dotenv import load_dotenv
def safe_url(query):
load_dotenv()
host = os.getenv('SERVER_HOST')
return host + Parser.url_safe(query)
def get(query):
"""
    Function used to send a GET request to find one or several documents.
    :param query: user input query
    :return: JSON of the matching author(s) or book(s)
"""
    print("GET: " + query)
    url = safe_url(query)
req = requests.get(url)
print("response code: " + str(req.status_code))
return req.json()
def put(query, json_file):
print("PUT: " + query)
url = safe_url(query)
req = requests.put(url, json=json_file)
print("response code: " + str(req.status_code))
return req.text
def post(query, json_file):
print("POST: " + query)
url = safe_url(query)
    req = requests.post(url, json=json_file)
print("response code: " + str(req.status_code))
return req.text
def delete(query):
print("DELETE: " + query)
url = safe_url(query)
req = requests.delete(url)
print("response code: " + str(req.status_code))
return req.text
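# Example usage (sketch, not part of the original module): assumes SERVER_HOST is set
# in .env (e.g. SERVER_HOST=http://127.0.0.1:5000/api/ -- the exact value is an
# assumption) and that Parser.url_safe only escapes the query for use in a URL.
#     result = get("book?id=12345")           # GET one book document as JSON
#     put("book?id=12345", {"rating": 4.5})   # PUT an update for that document
#     delete("author?id=678")                 # DELETE the matching author document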
import typer
import RegularExpressionParser.Parser as Parser
import Crawler.Main as Main
import API.Functions as API
import os
import json
from dotenv import load_dotenv
app = typer.Typer()
@app.command()
def scrape_book(url: str):
""" Function used to create scrape request to Server """
def main_page():
    print("Welcome to the goodReads scraper!\n"
          "This is the main page\n"
          "Usage:\n"
          "scrape: scrape goodReads and load the data into the database\n"
          "get: find a document for a book or an author in the database\n"
          "put: create or update a document in the database\n"
          "post: add one or multiple documents of books or authors to the database\n"
          "delete: delete a certain document from the database\n"
          "export: export a certain collection from the database\n"
          "exit: end the program")
order = input()
if order == "scrape":
scrape()
elif order == "get":
get()
elif order == "put":
put()
elif order == "post":
post()
elif order == "delete":
delete()
elif order == "export":
export()
elif order == "exit":
exit()
else:
print("Unknown command")
main_page()
def scrape():
"""
    Function used to create a scrape request to the server.
    Only for assignment 2.0; for the GET-based scrape, see the GET handler in SimplerServer.py.
"""
print('please enter URL of the book')
url = input()
print('please enter the number of books you want to crawl')
book_num = input()
    if not book_num.isdigit():
        print("max book num must be an integer")
        main_page()
        return
book_num = int(book_num)
print('please enter the number of authors you want to crawl')
author_num = input()
    if not author_num.isdigit():
        print("max author num must be an integer")
        main_page()
        return
author_num = int(author_num)
print('Request received, start scraping')
if not Parser.check_if_address_valid(url):
print("Error: invalid URL")
main_page()
elif book_num >= 2000 or author_num >= 2000:
        print("Error: the number of books and authors should be lower than 2000")
main_page()
else:
if book_num > 200 or author_num > 50:
            print("Warning: the requested number of books or authors is large")
Main.scrape_api(url, book_num, author_num)
main_page()
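# Example input for the prompts above (sketch): a goodReads book URL such as
# https://www.goodreads.com/book/show/3735293-clean-code, with book_num <= 200 and
# author_num <= 50 to stay under the warning thresholds checked above.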
@app.command()
def goodbye(name: str, formal: bool = False):
""" function2 """
    if formal:
        typer.echo(f"Goodbye Ms. {name}. Have a good day.")
    else:
        typer.echo(f"Bye {name}!")
def get():
print("Please enter your query:")
query = input()
result = API.get(query)
typer.echo(result)
main_page()
def put():
print("Please enter your query:")
query = input()
print("Please select one way to load update info: input or read:")
load = input()
if load == "input":
print("Please enter json data:")
string = input()
try:
data = json.loads(string)
        except ValueError as err:
            print(err)
            main_page()
            return
elif load == "read":
load_dotenv()
file_root = os.getenv('FILE_ROOT')
        print("Please enter the name of the file in the file root directory:")
file_path = r"" + file_root + input()
if os.path.isfile(file_path):
with open(file_path, "r") as file:
try:
data = json.load(file)
                except ValueError as err:
                    print(err)
                    main_page()
                    return
        else:
            print("Error: the file does not exist")
            main_page()
            return
    else:
        print("Unknown command")
        main_page()
        return
result = API.put(query, data)
typer.echo(result)
main_page()
def post():
print("Please enter your query:")
query = input()
load_dotenv()
file_root = os.getenv('FILE_ROOT')
    print("Please enter the name of the file in the file root directory:")
file_path = r"" + file_root + input()
if os.path.isfile(file_path):
with open(file_path, "r") as file:
try:
data = json.load(file)
            except ValueError as err:
                print(err)
                main_page()
                return
    else:
        print("Error: the file does not exist")
        main_page()
        return
result = API.post(query, data)
typer.echo(result)
main_page()
def delete():
print("Please enter your query:")
query = input()
result = API.delete(query)
typer.echo(result)
main_page()
def export():
print("Which collection do you want to export? books or authors?")
collection = input()
if collection == "books":
        data = API.get("book")
load_dotenv()
file_root = os.getenv('FILE_ROOT')
with open(file_root + "books" + ".json", "w") as output:
            output.write(json.dumps(data))
print("books.json is stored at " + file_root)
main_page()
elif collection == "authors":
        data = API.get("author")
load_dotenv()
file_root = os.getenv('FILE_ROOT')
with open(file_root + "authors" + ".json", "w") as output:
            output.write(json.dumps(data))
print("authors.json is stored at " + file_root)
main_page()
elif collection == "back":
main_page()
else:
print("Unknown collection")
main_page()
if __name__ == "__main__":
app()
import Crawler.Spider as Spider
import time
import RegularExpressionParser.Parser as Parser
from DataBase import mongoDB as db
def scrape_api(url_api, book_num_api, author_num_api):
print('Request received, start scraping')
book_dict_api = {}
author_dict_api = {}
similar_book_dict_api = {}
count_api = 1
print("Crawling book #" + str(count_api))
start_page_api = Spider.Spider(url_api)
start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api)
while len(book_dict_api) < int(book_num_api) and len(author_dict_api) < int(author_num_api):
all_related_books_api = list(similar_book_dict_api.items())
for book_url_pair_api in all_related_books_api:
if len(book_dict_api) >= int(book_num_api) or len(author_dict_api) >= int(author_num_api):
break
book_name_api = book_url_pair_api[0]
if book_name_api in book_dict_api:
pass
else:
count_api += 1
print("Crawling book #" + str(count_api))
book_url_api = book_url_pair_api[1]
book_spider_api = Spider.Spider(book_url_api)
book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api)
# since the network delay from asia to goodread.com is already very high,
# a small amount of time of sleep is used here.
time.sleep(0.2)
db.insert_dicts(book_dict_api, 0)
db.insert_dicts(author_dict_api, 1)
print("Scraping completed")
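# Example call (sketch, reusing the book URL referenced below):
#     scrape_api("https://www.goodreads.com/book/show/3735293-clean-code", 3, 2)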
print('please enter URL of the book')
url = input()
# url = https://www.goodreads.com/book/show/3735293-clean-code
# https://www.goodreads.com/book/show/35133922-educated
print('please enter the number of books you want to crawl')
book_num = input()
# bookNum = 3
print('please enter the number of authors you want to crawl')
author_num = input()
# authorNum = 2
print('Request received, start scraping')
book_dict = {}
......@@ -27,17 +59,17 @@ start_page.crawl(book_dict, author_dict, similar_book_dict)
while len(book_dict) < int(book_num) or len(author_dict) < int(author_num):
all_related_books = list(similar_book_dict.items())
    for book_url_pair in all_related_books:
if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num):
break
        book_name = book_url_pair[0]
if book_name in book_dict:
pass
else:
count += 1
print("Crawling book #" + str(count))
            book_url = book_url_pair[1]
            book_spider = Spider.Spider(book_url)
book_spider.crawl(book_dict, author_dict, similar_book_dict)
# since the network delay from asia to goodread.com is already very high,
......
......@@ -13,6 +13,18 @@ def get_db():
return client.get_database("crawler_db")
def insert_document(docu, opt):
db = get_db()
if opt == 0:
records = db.books
elif opt == 1:
records = db.authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
records.insert_one(docu)
def insert_dicts(dictionary, opt):
"""
Insert books or authors collection in database
......@@ -49,9 +61,13 @@ def update_dicts(opt, identifier, content):
records = db.books
elif opt == 1:
records = db.authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
result = records.update_one(
identifier,
{"$set": content},
upsert=True
)
print("matched documentation: " + str(result.matched_count))
print("modified documentation: " + str(result.modified_count))
......@@ -96,8 +112,8 @@ def download_collection(opt, identifier, name):
"""
json_file = get_documents_json(opt, identifier)
load_dotenv()
    file_root = os.getenv('FILE_ROOT')
    with open(file_root + name + ".json", "w") as output:
output.write(json_file)
......
......@@ -49,29 +49,35 @@ def parse_query_to_url(query):
def parse_query_to_json(pair):
    print(pair)
    elements = re.findall("[0-9A-Za-z\"_.]+", pair)
    count = len(elements)
    print(count)
    if count != 2:
        print("Failed to parse query: invalid args number")
        return {"wrong": "True"}  # will never be found in the database
    else:
        # can be A.B: C or A.B: "C"
        if re.search(":", pair):
            if re.search("^[0-9.]*$", elements[1]):
                return {elements[0].split(".")[1]: float(elements[1])}
            else:
                if re.search("\".*\"", elements[1]):
                    return {elements[0].split(".")[1]: elements[1]}
                else:
                    return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}}
        # otherwise the operator can be NOT, >, or <
        elif re.search("NOT", pair):
            if re.search("^[0-9.]*$", elements[1]):
                return {elements[0].split(".")[1]: {"$ne": float(elements[1])}}
            else:
                return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}}
        elif re.search(">", pair):
            return {elements[0].split(".")[1]: {"$gt": float(elements[1])}}
        elif re.search("<", pair):
            return {elements[0].split(".")[1]: {"$lt": float(elements[1])}}
        else:
            print("Failed to parse query: unknown operator")
            return {"wrong": "True"}
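# Examples of the intended mapping (sketch; the exact query grammar accepted by the
# CLI is an assumption here):
#     parse_query_to_json('book.id: 12345')     -> {"id": 12345.0}
#     parse_query_to_json('book.name: Clean')   -> {"name": {"$regex": "Clean", "$options": "i"}}
#     parse_query_to_json('book.rating > 4.0')  -> {"rating": {"$gt": 4.0}}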
def url_safe(element):
......
import DataBase.mongoDB as DataBase
import RegularExpressionParser.Parser as Parser
import re
from flask import Flask
from flask import request
from urllib.parse import urlparse, parse_qs
app = Flask(__name__)
# app.config["DEBUG"] = True
@app.route("/", methods=['GET'])
......@@ -15,32 +15,79 @@ def home():
@app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"])
def data_base(collection):
if request.method == "GET":
if collection == "book":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
# print(qs_parsed["id"])
if qs_parsed == {}:
return DataBase.get_documents_json(0, {})
return search_document(["book.id:" + qs_parsed["id"][0]])
elif collection == "author":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
# print(type(qs_parsed["id"]))
if qs_parsed == {}:
return DataBase.get_documents_json(1, {})
return search_document(["author.id:" + qs_parsed["id"][0]])
elif collection == "search":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
print(qs_parsed)
return search_document(qs_parsed["q"][0].split("&"))
else:
            return "404: Unknown Collection to GET"
elif request.method == "PUT":
if request.headers["Content-Type"] != "application/json":
return "415: content should be JSON file"
json_update_info = request.json
if collection == "book":
opt = 0
elif collection == "author":
opt = 1
else:
            return "404: Unknown Collection to PUT"
DataBase.update_dicts(opt, request.args.to_dict(), json_update_info)
return "200: PUT succeeded"
elif request.method == "POST":
if request.headers["Content-Type"] != "application/json":
return "415: content should be JSON file"
json_file = request.json
if collection == "books":
DataBase.insert_dicts(json_file, 0)
elif collection == "authors":
DataBase.insert_dicts(json_file, 1)
elif collection == "book":
DataBase.insert_document(json_file, 0)
elif collection == "author":
DataBase.insert_document(json_file, 1)
elif collection == "scrape":
return "200"
else:
return "404: Unknown Collection to POST"
return "201: POST succeeded"
elif request.method == "DELETE":
identifier = request.args.to_dict()
print(identifier)
if collection == "book":
opt = 0
elif collection == "author":
opt = 1
else:
return "404: Unknown Collection to DELETE"
DataBase.clean(opt, identifier)
return "200: DELETE succeeded"
def search_document(identifiers):
if len(identifiers) == 1:
        json_idt = Parser.parse_query_to_json(identifiers[0])
print(json_idt)
if re.search("^book.*", identifiers[0]):
            return DataBase.get_documents_json(0, json_idt)
else:
            return DataBase.get_documents_json(1, json_idt)
elif len(identifiers) == 3:
if re.search("^book.*", identifiers[0]):
if re.search("^author.*", identifiers[2]):
......@@ -54,16 +101,18 @@ def search_document(identifiers):
return {}
else:
opt = 1
        json_idt1 = Parser.parse_query_to_json(identifiers[0])
        json_idt2 = Parser.parse_query_to_json(identifiers[2])
if identifiers[1] == "AND":
exp = {"$and": [json_idt1, json_idt2]}
elif identifiers[1] == "OR":
            exp = {"$or": [json_idt1, json_idt2]}
else:
print("Failed to parse query: unknown operator for identifiers[1]")
return {}
        print("exp:")
        print(exp)
        return DataBase.get_documents_json(opt, exp)
else:
return "Error, unknown identifiers"
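# Example requests handled above (sketch; the host and port depend on how the Flask
# app is run, e.g. http://127.0.0.1:5000, which is an assumption):
#     GET    /api/book?id=12345    -> one book document, looked up as "book.id:12345"
#     GET    /api/author           -> all author documents as JSON
#     PUT    /api/book?id=12345    -> update that book; body must be application/json
#     POST   /api/books            -> insert a dict of books from the JSON body
#     DELETE /api/author?id=678    -> delete the matching author document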
......