Commit 8959e6be authored by zshan2

assignment-2.1 version2.1: GET API fixed

parent 77982c85
1 merge request: !1 <assignment-2.1>
import requests
import os
import RegularExpressionParser.Parser as parser
import re
from dotenv import load_dotenv


def get(query):
    """
    Send a GET request to the server to find one or more documents.
    :param query: user input query
    :return: JSON data of the matching book(s) or author(s)
    """
    load_dotenv()
    host = os.getenv('SERVER_HOST')
    url = host + parser.url_safe(query)
    req = requests.get(url)
    return req.json()
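
A minimal usage sketch of get(), assuming the SERVER_HOST entry in .env points at the API root of the Flask server included in this commit (for example http://127.0.0.1:5000/api/); the query string is illustrative only:

if __name__ == "__main__":
    # Hypothetical example: look up a book by id through the search endpoint.
    result = get("search?q=book.id%3A3735293")
    print(result)
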
class BookPackage:
    def __init__(self, url, title, id, ISBN13, author_url, author, rating,
                 rating_count, review_count, image_url, similar_books):
        self.url = url
        self.title = title
        self.id = id
        self.ISBN13 = ISBN13
        self.author_url = author_url
        self.author = author
        self.rating = rating
        self.rating_count = rating_count
        self.review_count = review_count
        self.image_url = image_url
        self.similar_books = similar_books
import Crawler.Spider as Spider
import time
from DataBase import mongoDB as db


print('please enter URL of the book')
url = input()
# url = https://www.goodreads.com/book/show/3735293-clean-code
# https://www.goodreads.com/book/show/35133922-educated
print('please enter the number of books you want to crawl')
book_num = input()
# bookNum = 3
print('please enter the number of authors you want to crawl')
author_num = input()
# authorNum = 2
print('Request received, start scraping')

book_dict = {}
author_dict = {}
similar_book_dict = {}
count = 1

print("Crawling book #" + str(count))
start_page = Spider.Spider(url)
start_page.crawl(book_dict, author_dict, similar_book_dict)

while len(book_dict) < int(book_num) or len(author_dict) < int(author_num):
    all_related_books = list(similar_book_dict.items())
    for bookURLPair in all_related_books:
        if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num):
            break
        book_name = bookURLPair[0]
        if book_name in book_dict:
            pass
        else:
            count += 1
            print("Crawling book #" + str(count))
            bookURL = bookURLPair[1]
            book_spider = Spider.Spider(bookURL)
            book_spider.crawl(book_dict, author_dict, similar_book_dict)
            # since the network delay from Asia to goodreads.com is already very high,
            # only a small sleep is added between requests
            time.sleep(0.2)

db.insert_dicts(book_dict, 0)
db.insert_dicts(author_dict, 1)
print("Scraping completed")
import requests
from bs4 import BeautifulSoup
import Crawler.BookPack as BookPack
import Crawler.AuthorPack as AuthorPack


class Spider:
    def __init__(self, url):
        self.url = url
        self.soup = None

    def crawl(self, book_dict, author_dict, similar_book_dict):
        """
        Crawl one book page and its author page, filling book_dict, author_dict
        and similar_book_dict in place.
        """
        header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                                "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
        url = self.url
        res = requests.get(url, headers=header)
        self.soup = BeautifulSoup(res.text, "lxml")

        # book information
        titleAndAuthor = self.soup.title.string.split(" by ")
        title = titleAndAuthor[0]
        bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0]
        rating_raw = self.soup.find("span", {"itemprop": "ratingValue"}).text
        rating = float(str(rating_raw).replace(" ", "").replace("\n", ""))
        rating_count = int(self.soup.find("meta", itemprop="ratingCount")["content"])
        review_count = int(self.soup.find("meta", itemprop="reviewCount")["content"])
        ISBN13_raw = self.soup.find("meta", property="books:isbn")
        if ISBN13_raw["content"] == "null":
            print("Warning: book '" + title + "' has no ISBN code")
            ISBN13 = 0
        else:
            ISBN13 = int(ISBN13_raw["content"])
        imageURL = self.soup.find("img", id="coverImage")["src"]
        author = titleAndAuthor[1]
        author_page_link = self.soup.find("a", {"class": "authorName"})["href"]
        authorID = (author_page_link.split("show/")[1]).split(".")[0]

        # similar books listed on the "see more" page
        see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"]
        res_similar = requests.get(see_more_link, headers=header)
        similar_soup = BeautifulSoup(res_similar.text, "lxml")
        similar_books_and_authors = similar_soup.find_all("span", itemprop="name")
        similar_books = []
        for i in range(len(similar_books_and_authors)):
            if i % 2 == 0:
                # a book
                similar_book = similar_books_and_authors[i]
                similar_book_name = similar_book.text
                book_link = "https://www.goodreads.com" + similar_book.find_parent("a")["href"]
                similar_books.append(similar_book_name)
                similar_book_dict[similar_book_name] = book_link

        bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count,
                                       review_count, imageURL, similar_books)
        book_dict[title] = bookPKT

        # author information
        res_author = requests.get(author_page_link, headers=header)
        author_soup = BeautifulSoup(res_author.text, "lxml")
        author_rating = float(author_soup.find("span", {"class": "average"}).text)
        author_rating_count = float(author_soup.find("span", {"class": "value-title"})["title"])
        author_review_count = float((author_soup.find_all("span", {"class": "value-title"})[1])["title"])
        author_photo_url = author_soup.find("img", {"itemprop": "image"})["src"]
        related_links = author_soup.find("div", {"class": "hreview-aggregate"}).findChildren("a", recursive=False)

        # related authors
        related_authors_link = "https://www.goodreads.com" + related_links[1]["href"]
        res_related_authors = requests.get(related_authors_link, headers=header)
        related_author_soup = BeautifulSoup(res_related_authors.text, "lxml")
        related_authors = []
        related_authors_raw = related_author_soup.find_all("span", {"itemprop": "name"})
        for the_author in related_authors_raw:
            related_author_name = the_author.text
            if related_author_name == author:
                pass
            else:
                related_authors.append(related_author_name)

        # books written by this author
        author_books_link = "https://www.goodreads.com" + related_links[0]["href"]
        res_author_books = requests.get(author_books_link, headers=header)
        author_books_soup = BeautifulSoup(res_author_books.text, "lxml")
        author_books_raw = author_books_soup.find_all("a", {"class": "bookTitle"})
        author_books = []
        for book in author_books_raw:
            book_name = book.findChildren("span", recursive=False)
            if not book_name:
                pass
            else:
                book_name = book_name[0].text
                author_books.append(book_name)

        author_pkt = AuthorPack.AuthorPackage(author, author_page_link, authorID, author_rating, author_rating_count,
                                              author_review_count, author_photo_url, related_authors, author_books)
        author_dict[author] = author_pkt
# print(soup)
# print(soup.title.string)
# print(soup.find_all('a'))
# aTypes = soup.find_all('a')[0]
# print(aTypes)
# for obj in aTypes:
# if obj.
# print(soup.)
def parse_book_dict_to_json(dictionary):
    item_list = list(dictionary.items())
    return_list = []
    for items in item_list:
        book_package = items[1]
        book_json = parse_book_to_json(book_package)
        return_list.append(book_json)
    return return_list


def parse_author_dict_to_json(dictionary):
    item_list = list(dictionary.items())
    return_list = []
    for items in item_list:
        author_package = items[1]
        author_json = parse_author_to_json(author_package)
        return_list.append(author_json)
    return return_list


def parse_book_to_json(bookPKT):
    book_json = {
        "type": "book",
        "book_url": bookPKT.url,
        "title": bookPKT.title,
        "id": bookPKT.id,
        "ISBN": bookPKT.ISBN13,
        "author_url": bookPKT.author_url,
        "author": bookPKT.author,
        "rating": bookPKT.rating,
        "rating_count": bookPKT.rating_count,
        "review_count": bookPKT.review_count,
        "image_url": bookPKT.image_url,
        "similar_books": bookPKT.similar_books
    }
    return book_json


def parse_author_to_json(authorPKT):
    author_json = {
        "type": "author",
        "name": authorPKT.name,
        "author_url": authorPKT.url,
        "id": authorPKT.id,
        "rating": authorPKT.rating,
        "rating_count": authorPKT.rating_count,
        "review_count": authorPKT.review_count,
        "image_url": authorPKT.image_url,
        "related_authors": authorPKT.related_authors,
        "author_books": authorPKT.author_books
    }
    return author_json
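
A brief sketch of how these helpers are driven, assuming the BookPackage class shown earlier in this commit; every field value below is made up purely to show the output shape:

import Crawler.BookPack as BookPack

# Hypothetical example object (made-up values, empty similar_books list).
example_book = BookPack.BookPackage(
    "https://www.goodreads.com/book/show/3735293-clean-code", "Clean Code",
    "3735293", 9780000000000, "https://www.goodreads.com/author/show/0000000",
    "Robert C. Martin", 4.4, 10000, 500, "https://example.com/cover.jpg", [])
print(parse_book_dict_to_json({"Clean Code": example_book}))
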
import json
import DataBase.JsonParser as parser
import os
from dotenv import load_dotenv
from pymongo import MongoClient


def get_db():
    """ return the database of the goodReads crawler """
    load_dotenv()
    url = os.getenv('MONGODB_URL')
    client = MongoClient(url)
    return client.get_database("crawler_db")


def insert_dicts(dictionary, opt):
    """
    Insert a dictionary of books or authors into the corresponding collection.
    :param dictionary: the dictionary to insert into the collection
    :param opt: 0 means the books collection; 1 means the authors collection
    :return: no return value
    """
    db = get_db()
    if opt == 0:
        records = db.books
    elif opt == 1:
        records = db.authors
    else:
        print("failed to insert: wrong opt for selecting collection")
        return
    json_list = []
    if opt == 0:
        json_list = parser.parse_book_dict_to_json(dictionary)
    elif opt == 1:
        json_list = parser.parse_author_dict_to_json(dictionary)
    records.insert_many(json_list)


def update_dicts(opt, identifier, content):
    """
    Update documents in a given collection.
    :param opt: 0 means the books collection; 1 means the authors collection
    :param identifier: the identifier of the documents we want to find
    :param content: the content to update
    :return: no return value
    """
    db = get_db()
    if opt == 0:
        records = db.books
    elif opt == 1:
        records = db.authors
    else:
        print("failed to update: wrong opt for selecting collection")
        return
    result = records.update_many(
        identifier,
        {"$set": content},
    )
    print("matched documents: " + str(result.matched_count))
    print("modified documents: " + str(result.modified_count))


def get_documents_json(opt, identifier):
    """
    Find the documents specified by the identifier and return them as JSON.
    :param opt: 0 means the books collection; 1 means the authors collection
    :param identifier: identifier of the documents we want; empty ({}) selects the whole collection
    :return: JSON string of the selected documents
    """
    db = get_db()
    if opt == 0:
        records = db.books
    elif opt == 1:
        records = db.authors
    else:
        print("failed to get json file: wrong opt for selecting collection")
        return json.dumps({})
    data = records.find(identifier)
    file = {}
    if opt == 0:
        typeName = "books"
    else:
        typeName = "authors"
    file[typeName] = []
    for item in data:
        item.pop("_id")
        file[typeName].append(item)
    return json.dumps(file)


def download_collection(opt, identifier, name):
    """
    Download the books or authors collection as a JSON file.
    :param opt: 0 means the books collection; 1 means the authors collection
    :param identifier: identifier of the documents we want to download;
                       empty ({}) selects all documents in the given collection
    :param name: file name of the downloaded JSON
    :return: no return value (writes the JSON file to disk)
    """
    json_file = get_documents_json(opt, identifier)
    load_dotenv()
    root = os.getenv('ROOT')
    with open(root + name + ".json", "w") as output:
        output.write(json_file)


def clean(opt, identifier):
    """
    Delete specific documents in the given collection.
    :param opt: 0 means the books collection; 1 means the authors collection
    :param identifier: identifier of the documents we want to delete;
                       empty ({}) selects all documents in the given collection
    :return: no return value
    """
    db = get_db()
    if opt == 0:
        records = db.books
    elif opt == 1:
        records = db.authors
    else:
        print("failed to delete: wrong opt for selecting collection")
        return
    records.delete_many(identifier)
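
A minimal usage sketch of the database helpers above, assuming a MONGODB_URL entry in .env and documents shaped as produced by DataBase.JsonParser; the identifier and content values are illustrative only:

if __name__ == "__main__":
    # Hypothetical smoke test (not part of the module).
    docs = get_documents_json(0, {"title": {"$regex": "Clean", "$options": "i"}})
    print(docs)
    update_dicts(0, {"title": "Clean Code"}, {"rating": 4.5})
    clean(1, {"name": "Some Made-Up Author"})
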
import re


def check_if_address_valid(address):
    """
    Check whether the given address is a valid Goodreads book URL.
    :param address: the URL to check
    :return: True if the address looks like a Goodreads book page, False otherwise
    """
    x = re.search("^https://www.goodreads.com/book/show/[0-9]+", address)
    if x:
        return True
    else:
        return False


def parse_query_to_url(query):
    elements = re.findall("[0-9A-Za-z_\"><]+", query)
    count = len(elements)
    if count == 3:
        # can only be A.B: C or wrong
        if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:\s?[0-9a-zA-Z_\"]", query):
            return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2])
        else:
            print("Invalid query.")
            return ""
    elif count == 4:
        # a pair and one of [NOT, >, <].
        if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:\sNOT\s[0-9a-zA-Z_\"]", query):
            return url_safe(elements[0] + "." + elements[1] + "%3A" + "NOT" + elements[3])
        elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:\s>\s[0-9a-zA-Z_\"]", query):
            return url_safe(elements[0] + "." + elements[1] + "%3A" + ">" + elements[3])
        elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:\s<\s[0-9a-zA-Z_\"]", query):
            return url_safe(elements[0] + "." + elements[1] + "%3A" + "<" + elements[3])
        else:
            print("Invalid query.")
            return ""
    elif 6 <= count <= 8:
        # AND or OR operator joining two simple queries
        if re.search(".*\sAND\s.*", query):
            parts = query.split(" AND ")
            return parse_query_to_url(parts[0]) + "%26AND%26" + parse_query_to_url(parts[1])
        elif re.search(".*\sOR\s.*", query):
            parts = query.split(" OR ")
            return parse_query_to_url(parts[0]) + "%26OR%26" + parse_query_to_url(parts[1])
        else:
            print("Invalid query.")
            return ""
    else:
        print("Invalid query.")
        return ""


def parse_query_to_json(pair):
    elements = re.findall("[0-9A-Za-z_\"]+|[<>]", pair)
    count = len(elements)
    if count != 3 and count != 4:
        print("Failed to parse query: invalid args number")
        return {"wrong": "True"}  # will never be found in the database
    elif count == 3:
        # can be A.B: C or A.B: "C"
        if re.search("\".*\"", elements[2]):
            return {elements[0] + "." + elements[1]: elements[2]}
        else:
            return {elements[0] + "." + elements[1]: {"$regex": elements[2], "$options": "i"}}
    else:
        # can be NOT, >, or <
        if elements[2] == "NOT":
            return {elements[0] + "." + elements[1]: {"$not": {"$regex": elements[3], "$options": "i"}}}
        elif elements[2] == ">":
            return {elements[0] + "." + elements[1]: {"$gt": float(elements[3])}}
        elif elements[2] == "<":
            return {elements[0] + "." + elements[1]: {"$lt": float(elements[3])}}
        else:
            print("Failed to parse query: unknown operator")
            return {"wrong": "True"}


def url_safe(element):
    return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C")
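A short illustration of how the two parsers above behave on sample queries, assuming the "collection.field: value" grammar used elsewhere in this commit; expected results are shown in comments:

if __name__ == "__main__":
    # Hypothetical queries; the values are made up.
    print(parse_query_to_url('book.rating: > 4'))
    # -> book.rating%3A%3E4
    print(parse_query_to_json('book.rating: > 4'))
    # -> {'book.rating': {'$gt': 4.0}}
    print(parse_query_to_json('author.name: Robert'))
    # -> {'author.name': {'$regex': 'Robert', '$options': 'i'}}
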
import RegularExpressionParser.Parser as parser
import DataBase.mongoDB as db
import re
from flask import Flask
from flask import request
from urllib.parse import urlparse, parse_qs

app = Flask(__name__)
app.config["DEBUG"] = True


@app.route("/", methods=['GET'])
def home():
    return "200: successfully connected to home page\n"


@app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"])
def data(collection):
    if request.method == "GET":
        if collection == "books" or collection == "book":
            identifier = request.args.to_dict()
            print(identifier)
            print(type(identifier))
            return "200: find"
        elif collection == "authors" or collection == "author":
            print(request.url)
            return "200: find"
        elif collection == "search":
            url_parsed = urlparse(request.url)
            qs_parsed = parse_qs(url_parsed.query)
            return search_document(qs_parsed["q"][0].split("&"))
        else:
            return "404 not found"
    # elif request.method == "PUT":


def search_document(identifiers):
    if len(identifiers) == 1:
        json_idt = parser.parse_query_to_json(identifiers[0])
        if re.search("^book.*", identifiers[0]):
            return db.get_documents_json(0, json_idt)
        else:
            return db.get_documents_json(1, json_idt)
    elif len(identifiers) == 3:
        if re.search("^book.*", identifiers[0]):
            if re.search("^author.*", identifiers[2]):
                print("Failed to find documents: the two statements do not point to the same collection")
                return {}
            else:
                opt = 0
        else:
            if re.search("^book.*", identifiers[2]):
                print("Failed to find documents: the two statements do not point to the same collection")
                return {}
            else:
                opt = 1
        json_idt1 = parser.parse_query_to_json(identifiers[0])
        json_idt2 = parser.parse_query_to_json(identifiers[2])
        if identifiers[1] == "AND":
            exp = {"$and": [json_idt1, json_idt2]}
        elif identifiers[1] == "OR":
            exp = {"$or": [json_idt1, json_idt2]}
        else:
            print("Failed to parse query: unknown operator for identifiers[1]")
            return {}
        return db.get_documents_json(opt, exp)
    else:
        return "Error, unknown identifiers"


if __name__ == "__main__":
    app.run()
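
For reference, a small client-side sketch against the search endpoint above, assuming the Flask app is running locally on its default port (http://127.0.0.1:5000) and that book documents are already present in MongoDB; the query value is only an illustration of the grammar handled by RegularExpressionParser.Parser:

import requests

# Hypothetical call: find books whose title contains "code" (case-insensitive).
resp = requests.get("http://127.0.0.1:5000/api/search/?q=book.title%3Acode")
print(resp.status_code)
print(resp.text)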