Commit 8959e6be authored by zshan2

assignment-2.1 version2.1: GET API fixed

parent 77982c85
import requests
import os
import RegularExpressionParser.Parser as parser
import re
from dotenv import load_dotenv
def get(query):
"""
function used to send a GET request to retrieve one or more documents
:param query: user input query
:return: JSON data of the requested author(s) or book(s)
"""
load_dotenv()
host = os.getenv('SERVER_HOST')
url = host + parser.url_safe(query)
req = requests.get(url)
return req.json()
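# Usage sketch (hypothetical values): assumes the Flask API below is running and the
# .env file defines SERVER_HOST pointing at its search endpoint,
# e.g. SERVER_HOST=http://127.0.0.1:5000/api/search?q=
# result = get('book.id:3735293')
# print(result)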
class BookPackage:
def __init__(self, url, title, id, ISBN13, author_url, author, rating,
rating_count, review_count, image_url, similar_books):
self.url = url
self.title = title
self.id = id
self.ISBN13 = ISBN13
self.author_url = author_url
self.author = author
self.rating = rating
self.rating_count = rating_count
self.review_count = review_count
self.image_url = image_url
self.similar_books = similar_books
import Crawler.Spider as Spider
import Crawler.BookPack as BookPack
import Crawler.AuthorPack as AuthorPack
import time
from DataBase import mongoDB as db
print('please enter URL of the book')
url = input()
# url = https://www.goodreads.com/book/show/3735293-clean-code
# https://www.goodreads.com/book/show/35133922-educated
print('please enter the number of books you want to crawl')
book_num = input()
# bookNum = 3
print('please enter the number of authors you want to crawl')
author_num = input()
# authorNum = 2
print('Request received, start scraping')
book_dict = {}
author_dict = {}
similar_book_dict = {}
count = 1
print("Crawling book #" + str(count))
start_page = Spider.Spider(url)
start_page.crawl(book_dict, author_dict, similar_book_dict)
while len(book_dict) < int(book_num) or len(author_dict) < int(author_num):
all_related_books = list(similar_book_dict.items())
for bookURLPair in all_related_books:
if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num):
break
book_name = bookURLPair[0]
if book_name in book_dict:
pass
else:
count += 1
print("Crawling book #" + str(count))
bookURL = bookURLPair[1]
book_spider = Spider.Spider(bookURL)
book_spider.crawl(book_dict, author_dict, similar_book_dict)
# Since the network latency from Asia to goodreads.com is already high,
# only a short additional sleep is used between requests.
time.sleep(0.2)
db.insert_dicts(book_dict, 0)
db.insert_dicts(author_dict, 1)
print("Scraping completed")
import requests
from bs4 import BeautifulSoup
import Crawler.BookPack as BookPack
import Crawler.AuthorPack as AuthorPack
class Spider:
def __init__(self, url):
self.url = url
self.soup = None
def crawl(self, book_dict, author_dict, similar_book_dict):
header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
url = self.url
res = requests.get(url, headers=header)
self.soup = BeautifulSoup(res.text, "lxml")
titleAndAuthor = self.soup.title.string.split(" by ")
title = titleAndAuthor[0]
bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0]
rating_raw = self.soup.find("span", {"itemprop": "ratingValue"}).text
rating = float(str(rating_raw).replace(" ", "").replace("\n", ""))
rating_count = int(self.soup.find("meta", itemprop="ratingCount")["content"])
review_count = int(self.soup.find("meta", itemprop="reviewCount")["content"])
ISBN13_raw = self.soup.find("meta", property="books:isbn")
if ISBN13_raw["content"] == "null":
print("Warning: book '" + title + "' has no ISBN code")
ISBN13 = 0
else:
ISBN13 = int(ISBN13_raw["content"])
imageURL = self.soup.find("img", id="coverImage")["src"]
author = titleAndAuthor[1]
author_page_link = self.soup.find("a", {"class": "authorName"})["href"]
authorID = (author_page_link.split("show/")[1]).split(".")[0]
see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"]
res_similar = requests.get(see_more_link, headers=header)
similar_soup = BeautifulSoup(res_similar.text, 'lxml')
similar_books_and_authors = similar_soup.find_all("span", itemprop="name")
similar_books = []
for i in range(len(similar_books_and_authors)):
if i % 2 == 0:
# a book
similar_book = similar_books_and_authors[i]
similar_book_name = similar_book.text
book_link = "https://www.goodreads.com" + similar_book.find_parent("a")["href"]
similar_books.append(similar_book_name)
similar_book_dict[similar_book_name] = book_link
bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count,
review_count, imageURL, similar_books)
book_dict[title] = bookPKT
res_author = requests.get(author_page_link, headers=header)
author_soup = BeautifulSoup(res_author.text, 'lxml')
author_rating = float(author_soup.find("span", {"class": "average"}).text)
author_rating_count = float(author_soup.find("span", {"class": "value-title"})["title"])
author_review_count = float((author_soup.find_all("span", {"class": "value-title"})[1])["title"])
author_photo_url = author_soup.find("img", {"itemprop": "image"})["src"]
related_links = author_soup.find("div", {"class": "hreview-aggregate"}).findChildren("a", recursive=False)
related_authors_link = "https://www.goodreads.com" + related_links[1]["href"]
res_related_authors = requests.get(related_authors_link, headers=header)
related_author_soup = BeautifulSoup(res_related_authors.text, 'lxml')
related_authors = []
related_authors_raw = related_author_soup.find_all("span", {"itemprop": "name"})
for the_author in related_authors_raw:
related_author_name = the_author.text
if related_author_name == author:
pass
else:
related_authors.append(related_author_name)
author_books_link = "https://www.goodreads.com" + related_links[0]["href"]
res_author_books = requests.get(author_books_link, headers=header)
author_books_soup = BeautifulSoup(res_author_books.text, "lxml")
author_books_raw = author_books_soup.find_all("a", {"class": "bookTitle"})
author_books = []
for book in author_books_raw:
book_name = book.findChildren("span", recursive=False)
if not book_name:
pass
else:
book_name = book_name[0].text
author_books.append(book_name)
author_pkt = AuthorPack.AuthorPackage(author, author_page_link, authorID, author_rating, author_rating_count,
author_review_count, author_photo_url, related_authors, author_books)
author_dict[author] = author_pkt
# print(soup)
# print(soup.title.string)
# print(soup.find_all('a'))
# aTypes = soup.find_all('a')[0]
# print(aTypes)
# for obj in aTypes:
# if obj.
# print(soup.)
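# Usage sketch: crawl one book page and collect results into the three dicts
# (requires network access and the Goodreads page layout assumed by the selectors above).
# book_dict, author_dict, similar_book_dict = {}, {}, {}
# spider = Spider("https://www.goodreads.com/book/show/3735293-clean-code")
# spider.crawl(book_dict, author_dict, similar_book_dict)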
def parse_book_dict_to_json(dictionary):
item_list = list(dictionary.items())
return_list = []
for items in item_list:
book_package = items[1]
book_json = parse_book_to_json(book_package)
return_list.append(book_json)
return return_list
def parse_author_dict_to_json(dictionary):
item_list = list(dictionary.items())
return_list = []
for items in item_list:
author_package = items[1]
author_json = parse_author_to_json(author_package)
return_list.append(author_json)
return return_list
def parse_book_to_json(bookPKT):
book_json = {
"type": "book",
"book_url": bookPKT.url,
"title": bookPKT.title,
"id": bookPKT.id,
"ISBN": bookPKT.ISBN13,
"author_url": bookPKT.author_url,
"author": bookPKT.author,
"rating": bookPKT.rating,
"rating_count": bookPKT.rating_count,
"review_count": bookPKT.review_count,
"image_url": bookPKT.image_url,
"similar_books": bookPKT.similar_books
}
return book_json
def parse_author_to_json(authorPKT):
author_json = {
"type": "author",
"name": authorPKT.name,
"author_url": authorPKT.url,
"id": authorPKT.id,
"rating": authorPKT.rating,
"rating_count": authorPKT.rating_count,
"review_count": authorPKT.review_count,
"image_url": authorPKT.image_url,
"related_authors": authorPKT.related_authors,
"author_books": authorPKT.author_books
}
return author_json
import json
import DataBase.JsonParser as parser
import os
from dotenv import load_dotenv
from pymongo import MongoClient
def get_db():
""" return the database of goodReads crawler """
load_dotenv()
url = os.getenv('MONGODB_URL')
client = MongoClient(url)
return client.get_database("crawler_db")
def insert_dicts(dictionary, opt):
"""
Insert a dictionary of books or authors into the corresponding collection
:param dictionary: the dictionary of crawled packages to insert
:param opt: =0 means books collection; =1 means authors collection
:return: no return value
"""
db = get_db()
if opt == 0:
records = db.books
elif opt == 1:
records = db.authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
json_list = []
if opt == 0:
json_list = parser.parse_book_dict_to_json(dictionary)
elif opt == 1:
json_list = parser.parse_author_dict_to_json(dictionary)
records.insert_many(json_list)
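# Example (hypothetical dictionaries built by the crawler):
# insert_dicts(book_dict, 0)    # writes the book packages into db.books
# insert_dicts(author_dict, 1)  # writes the author packages into db.authors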
def update_dicts(opt, identifier, content):
"""
Update documents in a given collection
:param opt: =0 means books collection; =1 means authors collection
:param identifier: filter identifying the documents we want to update
:param content: the content to update
:return: no return value
"""
db = get_db()
if opt == 0:
records = db.books
elif opt == 1:
records = db.authors
result = records.update_many(
identifier,
{"$set": content},
)
print("matched documentation: " + str(result.matched_count))
print("modified documentation: " + str(result.modified_count))
def get_documents_json(opt, identifier):
"""
Find documents matching the given identifier and return them as JSON
:param opt: =0 means books collection; =1 means authors collection
:param identifier: filter for the documents we want; {} selects the whole collection
:return: JSON string of the selected documents
"""
db = get_db()
if opt == 0:
records = db.books
elif opt == 1:
records = db.authors
else:
print("failed to get json file: wrong opt for selecting collection")
return json.dumps({})
data = records.find(identifier)
file = {}
if opt == 0:
typeName = "books"
else:
typeName = "authors"
file[typeName] = []
for item in data:
item.pop("_id")
file[typeName].append(item)
return json.dumps(file)
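# Examples (hypothetical filters):
# get_documents_json(0, {})                             # every book document as a JSON string
# get_documents_json(1, {"name": "Robert C. Martin"})   # author documents matching the filter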
def download_collection(opt, identifier, name):
"""
download books collection or authors collection
:param opt: =0 means books collection; =1 means authors collection
:param identifier: identifier of the documents we want to download;
empty({}) means selected all documents in given collection
:param name: file name of downloaded json
:return: no return value; the collection is written to <name>.json under the ROOT directory
"""
json_file = get_documents_json(opt, identifier)
load_dotenv()
root = os.getenv('ROOT')
with open(root + name + ".json", "w") as output:
output.write(json_file)
def clean(opt, identifier):
"""
delete specific documents in given collection
:param opt: =0 means books collection; =1 means authors collection
:param identifier: identifier of the documents we want to delete;
empty({}) means selected all documents in given collection
:return: no return value
"""
db = get_db()
if opt == 0:
records = db.books
elif opt == 1:
records = db.authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
records.delete_many(identifier)
import re
def check_if_address_valid(address):
"""
Check whether the given address is a valid Goodreads book URL
:param address: URL string entered by the user
:return: True if the address matches the Goodreads book URL pattern, False otherwise
"""
x = re.search("^https://www.goodreads.com/book/show/[0-9]+", address)
if x:
return True
else:
return False
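# Examples (the second URL is hypothetical):
# check_if_address_valid("https://www.goodreads.com/book/show/3735293-clean-code")  # True
# check_if_address_valid("https://www.goodreads.com/author/show/123")               # False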
def parse_query_to_url(query):
elements = re.findall("[0-9A-Za-z_\"><]+", query)
count = len(elements)
if count == 3:
# can only be A.B:C or wrong
if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2])
else:
print("Invalid query.")
return ""
elif count == 4:
# a pair and one of [NOT, >, <].
if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\sNOT\s[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + "NOT" + elements[2])
elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s>\s[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + ">" + elements[2])
elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s<\s[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + "<" + elements[2])
else:
print("Invalid query.")
return ""
elif 6 <= count <= 8:
# AND or OR operator
if re.search(".*\sAND\s.*", query):
parts = query.split(" AND ")
return parse_query_to_url(parts[0]) + "%26AND%26" + parse_query_to_url(parts[1])
elif re.search(".*\sOR\s.*", query):
parts = query.split(" OR ")
return parse_query_to_url(parts[0]) + "%26OR%26" + parse_query_to_url(parts[1])
else:
print("Invalid query.")
return ""
def parse_query_to_json(pair):
elements = re.findall("[0-9A-Za-z\"._]", pair)
count = len(elements)
print(count)
if count != 3 and count != 4:
print("Failed to parse query: invalid args number")
return {"wrong": "True"} # will never be founded in database
elif count == 3:
# can be A.B: C or A.B: "C"
if re.search("\".*\"", elements[2]):
return {elements[0] + "." + elements[1]: elements[2]}
else:
return {elements[0] + "." + elements[1]: {"$regex": elements[2], "$options": "i"}}
else:
# can be NOT, >, or <
if elements[2] == "NOT":
return {elements[0] + "." + elements[1]: {"$not": {"$regex": elements[3], "$options": "i"}}}
elif elements[2] == ">":
return {elements[0] + "." + elements[1]: {"$gt": float(elements[3])}}
elif elements[2] == "<":
return {elements[0] + "." + elements[1]: {"$lt": float(elements[3])}}
else:
print("Failed to parse query: unknown operator")
return {"wrong": "True"}
def url_safe(element):
return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C")
import RegularExpressionParser.Parser as parser
import DataBase.mongoDB as db
import re
from flask import Flask
from flask import request
from urllib.parse import urlparse, parse_qs
app = Flask(__name__)
app.config["DEBUG"] = True
@app.route("/", methods=['GET'])
def home():
return "200: successfully connected to home page\n"
@app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"])
def data(collection):
if request.method == "GET":
if collection == "books" or collection == "book":
identifier = request.args.to_dict()
print(identifier)
print(type(identifier))
return "200: find"
elif collection == "authors" or collection == "author":
print(request.url)
return "200: find"
elif collection == "search":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
return search_document(qs_parsed["q"][0].split("&"))
else:
return "404 not found"
# elif request.method == "PUT":
def search_document(identifiers):
if len(identifiers) == 1:
json_idt = parser.parse_query_to_json(identifiers[0])
if re.search("^book.*", identifiers[0]):
return db.get_documents_json(0, json_idt)
else:
return db.get_documents_json(1, json_idt)
elif len(identifiers) == 3:
if re.search("^book.*", identifiers[0]):
if re.search("^author.*", identifiers[2]):
print("Failed to find documentation: two statements are not pointing to the same collection")
return {}
else:
opt = 0
else:
if re.search("^book.*",identifiers[2]):
print("Failed to find documentation: two statements are not pointing to the same collection")
return {}
else:
opt = 1
json_idt1 = parser.parse_query_to_json(identifiers[0])
json_idt2 = parser.parse_query_to_json(identifiers[2])
if identifiers[1] == "AND":
exp = {"$and": [json_idt1, json_idt2]}
elif identifiers[1] == "OR":
exp = {"$or": [json_idt1,json_idt2]}
else:
print("Failed to parse query: unknown operator for identifiers[1]")
return {}
return db.get_documents_json(opt, exp)
else:
return "Error, unknown identifiers"
if __name__ == "__main__":
app.run()
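# Example request (hypothetical local host/port; the q value is the encoded query
# produced by RegularExpressionParser.Parser):
# GET http://127.0.0.1:5000/api/search?q=book.title%3Acode
# The identifier is parsed by parse_query_to_json into a case-insensitive regex filter
# and passed to DataBase.mongoDB.get_documents_json.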