diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py index db9c680a4bad33fd6b757db2e487568cdbd881ba..131d81c8a75cfc887632b568d938c297b493ae0b 100755 --- a/dev/create-release/generate-contributors.py +++ b/dev/create-release/generate-contributors.py @@ -33,14 +33,14 @@ PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0") while not tag_exists(RELEASE_TAG): RELEASE_TAG = raw_input("Please provide a valid release tag: ") while not tag_exists(PREVIOUS_RELEASE_TAG): - print "Please specify the previous release tag." - PREVIOUS_RELEASE_TAG = raw_input(\ - "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ") + print("Please specify the previous release tag.") + PREVIOUS_RELEASE_TAG = raw_input( + "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ") # Gather commits found in the new tag but not in the old tag. # This filters commits based on both the git hash and the PR number. # If either is present in the old tag, then we ignore the commit. -print "Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG) +print("Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG)) release_commits = get_commits(RELEASE_TAG) previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG) previous_release_hashes = set() @@ -62,17 +62,20 @@ if not new_commits: sys.exit("There are no new commits between %s and %s!" 
% (PREVIOUS_RELEASE_TAG, RELEASE_TAG)) # Prompt the user for confirmation that the commit range is correct -print "\n==================================================================================" -print "JIRA server: %s" % JIRA_API_BASE -print "Release tag: %s" % RELEASE_TAG -print "Previous release tag: %s" % PREVIOUS_RELEASE_TAG -print "Number of commits in this range: %s" % len(new_commits) +print("\n==================================================================================") +print("JIRA server: %s" % JIRA_API_BASE) +print("Release tag: %s" % RELEASE_TAG) +print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG) +print("Number of commits in this range: %s" % len(new_commits)) print + + def print_indented(_list): - for x in _list: print " %s" % x + for x in _list: + print(" %s" % x) if yesOrNoPrompt("Show all commits?"): print_indented(new_commits) -print "==================================================================================\n" +print("==================================================================================\n") if not yesOrNoPrompt("Does this look correct?"): sys.exit("Ok, exiting") @@ -82,45 +85,76 @@ maintenance = [] reverts = [] nojiras = [] filtered_commits = [] + + def is_release(commit_title): - return re.findall("\[release\]", commit_title.lower()) or\ - "preparing spark release" in commit_title.lower() or\ - "preparing development version" in commit_title.lower() or\ - "CHANGES.txt" in commit_title + return re.findall("\[release\]", commit_title.lower()) or \ + "preparing spark release" in commit_title.lower() or \ + "preparing development version" in commit_title.lower() or \ + "CHANGES.txt" in commit_title + + def is_maintenance(commit_title): - return "maintenance" in commit_title.lower() or\ - "manually close" in commit_title.lower() + return "maintenance" in commit_title.lower() or \ + "manually close" in commit_title.lower() + + def has_no_jira(commit_title): return not re.findall("SPARK-[0-9]+", 
commit_title.upper()) + + def is_revert(commit_title): return "revert" in commit_title.lower() + + def is_docs(commit_title): - return re.findall("docs*", commit_title.lower()) or\ - "programming guide" in commit_title.lower() + return re.findall("docs*", commit_title.lower()) or \ + "programming guide" in commit_title.lower() + + for c in new_commits: t = c.get_title() - if not t: continue - elif is_release(t): releases.append(c) - elif is_maintenance(t): maintenance.append(c) - elif is_revert(t): reverts.append(c) - elif is_docs(t): filtered_commits.append(c) # docs may not have JIRA numbers - elif has_no_jira(t): nojiras.append(c) - else: filtered_commits.append(c) + if not t: + continue + elif is_release(t): + releases.append(c) + elif is_maintenance(t): + maintenance.append(c) + elif is_revert(t): + reverts.append(c) + elif is_docs(t): + filtered_commits.append(c) # docs may not have JIRA numbers + elif has_no_jira(t): + nojiras.append(c) + else: + filtered_commits.append(c) # Warn against ignored commits if releases or maintenance or reverts or nojiras: - print "\n==================================================================================" - if releases: print "Found %d release commits" % len(releases) - if maintenance: print "Found %d maintenance commits" % len(maintenance) - if reverts: print "Found %d revert commits" % len(reverts) - if nojiras: print "Found %d commits with no JIRA" % len(nojiras) - print "* Warning: these commits will be ignored.\n" + print("\n==================================================================================") + if releases: + print("Found %d release commits" % len(releases)) + if maintenance: + print("Found %d maintenance commits" % len(maintenance)) + if reverts: + print("Found %d revert commits" % len(reverts)) + if nojiras: + print("Found %d commits with no JIRA" % len(nojiras)) + print("* Warning: these commits will be ignored.\n") if yesOrNoPrompt("Show ignored commits?"): - if releases: print "Release (%d)" 
% len(releases); print_indented(releases) - if maintenance: print "Maintenance (%d)" % len(maintenance); print_indented(maintenance) - if reverts: print "Revert (%d)" % len(reverts); print_indented(reverts) - if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras) - print "==================== Warning: the above commits will be ignored ==================\n" + if releases: + print("Release (%d)" % len(releases)) + print_indented(releases) + if maintenance: + print("Maintenance (%d)" % len(maintenance)) + print_indented(maintenance) + if reverts: + print("Revert (%d)" % len(reverts)) + print_indented(reverts) + if nojiras: + print("No JIRA (%d)" % len(nojiras)) + print_indented(nojiras) + print("==================== Warning: the above commits will be ignored ==================\n") prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits) if not yesOrNoPrompt(prompt_msg): sys.exit("Ok, exiting.") @@ -147,9 +181,9 @@ invalid_authors = {} # } # author_info = {} -jira_options = { "server": JIRA_API_BASE } -jira_client = JIRA(options = jira_options) -print "\n=========================== Compiling contributor list ===========================" +jira_options = {"server": JIRA_API_BASE} +jira_client = JIRA(options=jira_options) +print("\n=========================== Compiling contributor list ===========================") for commit in filtered_commits: _hash = commit.get_hash() title = commit.get_title() @@ -168,8 +202,9 @@ for commit in filtered_commits: # Parse components from the commit title, if any commit_components = find_components(title, _hash) # Populate or merge an issue into author_info[author] + def populate(issue_type, components): - components = components or [CORE_COMPONENT] # assume core if no components provided + components = components or [CORE_COMPONENT] # assume core if no components provided if author not in author_info: author_info[author] = {} if issue_type not in author_info[author]: @@ -182,17 
+217,17 @@ for commit in filtered_commits: jira_issue = jira_client.issue(issue) jira_type = jira_issue.fields.issuetype.name jira_type = translate_issue_type(jira_type, issue, warnings) - jira_components = [translate_component(c.name, _hash, warnings)\ - for c in jira_issue.fields.components] + jira_components = [translate_component(c.name, _hash, warnings) + for c in jira_issue.fields.components] all_components = set(jira_components + commit_components) populate(jira_type, all_components) except Exception as e: - print "Unexpected error:", e + print("Unexpected error:", e) # For docs without an associated JIRA, manually add it ourselves if is_docs(title) and not issues: populate("documentation", commit_components) - print " Processed commit %s authored by %s on %s" % (_hash, author, date) -print "==================================================================================\n" + print(" Processed commit %s authored by %s on %s" % (_hash, author, date)) +print("==================================================================================\n") # Write to contributors file ordered by author names # Each line takes the format " * Author name -- semi-colon delimited contributions" @@ -215,8 +250,8 @@ for author in authors: # Otherwise, group contributions by issue types instead of modules # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN else: - contributions = ["%s in %s" % (issue_type, nice_join(comps)) \ - for issue_type, comps in author_info[author].items()] + contributions = ["%s in %s" % (issue_type, nice_join(comps)) + for issue_type, comps in author_info[author].items()] contribution = "; ".join(contributions) # Do not use python's capitalize() on the whole string to preserve case assert contribution @@ -226,11 +261,11 @@ for author in authors: # E.g. 
andrewor14/SPARK-3425/SPARK-1157/SPARK-6672 if author in invalid_authors and invalid_authors[author]: author = author + "/" + "/".join(invalid_authors[author]) - #line = " * %s -- %s" % (author, contribution) + # line = " * %s -- %s" % (author, contribution) line = author contributors_file.write(line + "\n") contributors_file.close() -print "Contributors list is successfully written to %s!" % contributors_file_name +print("Contributors list is successfully written to %s!" % contributors_file_name) # Prompt the user to translate author names if necessary if invalid_authors: @@ -241,8 +276,8 @@ if invalid_authors: # Log any warnings encountered in the process if warnings: - print "\n============ Warnings encountered while creating the contributor list ============" - for w in warnings: print w - print "Please correct these in the final contributors list at %s." % contributors_file_name - print "==================================================================================\n" - + print("\n============ Warnings encountered while creating the contributor list ============") + for w in warnings: + print(w) + print("Please correct these in the final contributors list at %s." 
% contributors_file_name) + print("==================================================================================\n") diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index 5d0ac16b3b0a12a0c83433b00f9d3638312ccda5..730138195e5feec65d4ce49234e901a3e54c3b65 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -30,28 +30,29 @@ try: except ImportError: from jira.utils import JIRAError except ImportError: - print "This tool requires the jira-python library" - print "Install using 'sudo pip install jira'" + print("This tool requires the jira-python library") + print("Install using 'sudo pip install jira'") sys.exit(-1) try: from github import Github from github import GithubException except ImportError: - print "This tool requires the PyGithub library" - print "Install using 'sudo pip install PyGithub'" + print("This tool requires the PyGithub library") + print("Install using 'sudo pip install PyGithub'") sys.exit(-1) try: import unidecode except ImportError: - print "This tool requires the unidecode library to decode obscure github usernames" - print "Install using 'sudo pip install unidecode'" + print("This tool requires the unidecode library to decode obscure github usernames") + print("Install using 'sudo pip install unidecode'") sys.exit(-1) # Contributors list file name contributors_file_name = "contributors.txt" + # Prompt the user to answer yes or no until they do so def yesOrNoPrompt(msg): response = raw_input("%s [y/n]: " % msg) @@ -59,30 +60,50 @@ def yesOrNoPrompt(msg): return yesOrNoPrompt(msg) return response == "y" + # Utility functions run git commands (written with Git 1.8.5) -def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] -def run_cmd_error(cmd): return Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()[1] +def run_cmd(cmd): + return Popen(cmd, stdout=PIPE).communicate()[0] + + +def run_cmd_error(cmd): + return Popen(cmd, stdout=PIPE, 
stderr=PIPE).communicate()[1] + + def get_date(commit_hash): return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash]) + + def tag_exists(tag): stderr = run_cmd_error(["git", "show", tag]) return "error" not in stderr + # A type-safe representation of a commit class Commit: - def __init__(self, _hash, author, title, pr_number = None): + def __init__(self, _hash, author, title, pr_number=None): self._hash = _hash self.author = author self.title = title self.pr_number = pr_number - def get_hash(self): return self._hash - def get_author(self): return self.author - def get_title(self): return self.title - def get_pr_number(self): return self.pr_number + + def get_hash(self): + return self._hash + + def get_author(self): + return self.author + + def get_title(self): + return self.title + + def get_pr_number(self): + return self.pr_number + def __str__(self): closes_pr = "(Closes #%s)" % self.pr_number if self.pr_number else "" return "%s %s %s %s" % (self._hash, self.author, self.title, closes_pr) + # Return all commits that belong to the specified tag. 
# # Under the hood, this runs a `git log` on that tag and parses the fields @@ -106,8 +127,9 @@ def get_commits(tag): raw_commits = [c for c in output.split(commit_start_marker) if c] for commit in raw_commits: if commit.count(commit_end_marker) != 1: - print "Commit end marker not found in commit: " - for line in commit.split("\n"): print line + print("Commit end marker not found in commit: ") + for line in commit.split("\n"): + print(line) sys.exit(1) # Separate commit digest from the body # From the digest we extract the hash, author and the title @@ -178,6 +200,7 @@ known_components = { "yarn": "YARN" } + # Translate issue types using a format appropriate for writing contributions # If an unknown issue type is encountered, warn the user def translate_issue_type(issue_type, issue_id, warnings): @@ -188,6 +211,7 @@ def translate_issue_type(issue_type, issue_id, warnings): warnings.append("Unknown issue type \"%s\" (see %s)" % (issue_type, issue_id)) return issue_type + # Translate component names using a format appropriate for writing contributions # If an unknown component is encountered, warn the user def translate_component(component, commit_hash, warnings): @@ -198,20 +222,22 @@ def translate_component(component, commit_hash, warnings): warnings.append("Unknown component \"%s\" (see %s)" % (component, commit_hash)) return component + # Parse components in the commit message # The returned components are already filtered and translated def find_components(commit, commit_hash): components = re.findall("\[\w*\]", commit.lower()) - components = [translate_component(c, commit_hash)\ - for c in components if c in known_components] + components = [translate_component(c, commit_hash) + for c in components if c in known_components] return components + # Join a list of strings in a human-readable manner # e.g. ["Juice"] -> "Juice" # e.g. ["Juice", "baby"] -> "Juice and baby" # e.g. 
["Juice", "baby", "moon"] -> "Juice, baby, and moon" def nice_join(str_list): - str_list = list(str_list) # sometimes it's a set + str_list = list(str_list) # sometimes it's a set if not str_list: return "" elif len(str_list) == 1: @@ -221,6 +247,7 @@ def nice_join(str_list): else: return ", ".join(str_list[:-1]) + ", and " + str_list[-1] + # Return the full name of the specified user on Github # If the user doesn't exist, return None def get_github_name(author, github_client): @@ -233,6 +260,7 @@ def get_github_name(author, github_client): raise e return None + # Return the full name of the specified user on JIRA # If the user doesn't exist, return None def get_jira_name(author, jira_client): @@ -245,15 +273,18 @@ def get_jira_name(author, jira_client): raise e return None + # Return whether the given name is in the form <First Name><space><Last Name> def is_valid_author(author): - if not author: return False + if not author: + return False return " " in author and not re.findall("[0-9]", author) + # Capitalize the first letter of each word in the given author name def capitalize_author(author): - if not author: return None + if not author: + return None words = author.split(" ") words = [w[0].capitalize() + w[1:] for w in words if w] return " ".join(words) - diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py index 2cc64e44448bcbfc40167bc0e86410d3ac15a827..be30e6ad30b2403603674c7ec775c4e7fdd31d14 100755 --- a/dev/create-release/translate-contributors.py +++ b/dev/create-release/translate-contributors.py @@ -45,8 +45,8 @@ if not GITHUB_API_TOKEN: # Write new contributors list to <old_file_name>.final if not os.path.isfile(contributors_file_name): - print "Contributors file %s does not exist!" % contributors_file_name - print "Have you run ./generate-contributors.py yet?" + print("Contributors file %s does not exist!" 
% contributors_file_name) + print("Have you run ./generate-contributors.py yet?") sys.exit(1) contributors_file = open(contributors_file_name, "r") warnings = [] @@ -58,11 +58,11 @@ if len(sys.argv) > 1: if "--non-interactive" in options: INTERACTIVE_MODE = False if INTERACTIVE_MODE: - print "Running in interactive mode. To disable this, provide the --non-interactive flag." + print("Running in interactive mode. To disable this, provide the --non-interactive flag.") # Setup Github and JIRA clients -jira_options = { "server": JIRA_API_BASE } -jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) +jira_options = {"server": JIRA_API_BASE} +jira_client = JIRA(options=jira_options, basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) github_client = Github(GITHUB_API_TOKEN) # Load known author translations that are cached locally @@ -70,7 +70,8 @@ known_translations = {} known_translations_file_name = "known_translations" known_translations_file = open(known_translations_file_name, "r") for line in known_translations_file: - if line.startswith("#"): continue + if line.startswith("#"): + continue [old_name, new_name] = line.strip("\n").split(" - ") known_translations[old_name] = new_name known_translations_file.close() @@ -91,6 +92,8 @@ known_translations_file = open(known_translations_file_name, "a") # (NOT_FOUND, "No assignee found for SPARK-1763") # ] NOT_FOUND = "Not found" + + def generate_candidates(author, issues): candidates = [] # First check for full name of Github user @@ -121,9 +124,11 @@ def generate_candidates(author, issues): user_name = jira_assignee.name display_name = jira_assignee.displayName if display_name: - candidates.append((display_name, "Full name of %s assignee %s" % (issue, user_name))) + candidates.append( + (display_name, "Full name of %s assignee %s" % (issue, user_name))) else: - candidates.append((NOT_FOUND, "No full name found for %s assignee %" % (issue, user_name))) + candidates.append( + (NOT_FOUND, "No full 
name found for %s assignee %s" % (issue, user_name))) else: candidates.append((NOT_FOUND, "No assignee found for %s" % issue)) # Guard against special characters in candidate names @@ -143,18 +148,18 @@ def generate_candidates(author, issues): # select from this list. Additionally, the user may also choose to enter a custom name. # In non-interactive mode, this script picks the first valid author name from the candidates # If no such name exists, the original name is used (without the JIRA numbers). -print "\n========================== Translating contributor list ==========================" +print("\n========================== Translating contributor list ==========================") lines = contributors_file.readlines() contributions = [] for i, line in enumerate(lines): # It is possible that a line in the contributor file only has the github name, e.g. yhuai. # So, we need a strip() to remove the newline. temp_author = line.strip(" * ").split(" -- ")[0].strip() - print "Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines)) + print("Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines))) if not temp_author: error_msg = " ERROR: Expected the following format \" * <author> -- <contributions>\"\n" error_msg += " ERROR: Actual = %s" % line - print error_msg + print(error_msg) warnings.append(error_msg) contributions.append(line) continue @@ -175,8 +180,8 @@ for i, line in enumerate(lines): # [3] andrewor14 - Raw Github username # [4] Custom candidate_names = [] - bad_prompts = [] # Prompts that can't actually be selected; print these first. - good_prompts = [] # Prompts that contain valid choices + bad_prompts = [] # Prompts that can't actually be selected; print these first. 
+ good_prompts = [] # Prompts that contain valid choices for candidate, source in candidates: if candidate == NOT_FOUND: bad_prompts.append(" [X] %s" % source) @@ -186,13 +191,16 @@ for i, line in enumerate(lines): good_prompts.append(" [%d] %s - %s" % (index, candidate, source)) raw_index = len(candidate_names) custom_index = len(candidate_names) + 1 - for p in bad_prompts: print p - if bad_prompts: print " ---" - for p in good_prompts: print p + for p in bad_prompts: + print(p) + if bad_prompts: + print(" ---") + for p in good_prompts: + print(p) # In interactive mode, additionally provide "custom" option and await user response if INTERACTIVE_MODE: - print " [%d] %s - Raw Github username" % (raw_index, author) - print " [%d] Custom" % custom_index + print(" [%d] %s - Raw Github username" % (raw_index, author)) + print(" [%d] Custom" % custom_index) response = raw_input(" Your choice: ") last_index = custom_index while not response.isdigit() or int(response) > last_index: @@ -204,8 +212,8 @@ for i, line in enumerate(lines): new_author = candidate_names[response] # In non-interactive mode, just pick the first candidate else: - valid_candidate_names = [name for name, _ in candidates\ - if is_valid_author(name) and name != NOT_FOUND] + valid_candidate_names = [name for name, _ in candidates + if is_valid_author(name) and name != NOT_FOUND] if valid_candidate_names: new_author = valid_candidate_names[0] # Finally, capitalize the author and replace the original one with it @@ -213,17 +221,20 @@ for i, line in enumerate(lines): if is_valid_author(new_author): new_author = capitalize_author(new_author) else: - warnings.append("Unable to find a valid name %s for author %s" % (author, temp_author)) - print " * Replacing %s with %s" % (author, new_author) - # If we are in interactive mode, prompt the user whether we want to remember this new mapping - if INTERACTIVE_MODE and\ - author not in known_translations and\ - yesOrNoPrompt(" Add mapping %s -> %s to known 
translations file?" % (author, new_author)): + warnings.append( + "Unable to find a valid name %s for author %s" % (author, temp_author)) + print(" * Replacing %s with %s" % (author, new_author)) + # If we are in interactive mode, prompt the user whether we want to remember this new + # mapping + if INTERACTIVE_MODE and \ + author not in known_translations and \ + yesOrNoPrompt( + " Add mapping %s -> %s to known translations file?" % (author, new_author)): known_translations_file.write("%s - %s\n" % (author, new_author)) known_translations_file.flush() line = line.replace(temp_author, author) contributions.append(line) -print "==================================================================================\n" +print("==================================================================================\n") contributors_file.close() known_translations_file.close() @@ -244,12 +255,13 @@ for line in contributions: new_contributors_file.write(line) new_contributors_file.close() -print "Translated contributors list successfully written to %s!" % new_contributors_file_name +print("Translated contributors list successfully written to %s!" % new_contributors_file_name) # Log any warnings encountered in the process if warnings: - print "\n========== Warnings encountered while translating the contributor list ===========" - for w in warnings: print w - print "Please manually correct these in the final contributors list at %s." % new_contributors_file_name - print "==================================================================================\n" - + print("\n========== Warnings encountered while translating the contributor list ===========") + for w in warnings: + print(w) + print("Please manually correct these in the final contributors list at %s." 
% + new_contributors_file_name) + print("==================================================================================\n") diff --git a/dev/github_jira_sync.py b/dev/github_jira_sync.py index 287f0ca24a7dfec4052cd96819085928a91f7e98..acc9aeabbb9fbd9913e76d8c28ffb0e43968877b 100755 --- a/dev/github_jira_sync.py +++ b/dev/github_jira_sync.py @@ -27,8 +27,8 @@ import urllib2 try: import jira.client except ImportError: - print "This tool requires the jira-python library" - print "Install using 'sudo pip install jira'" + print("This tool requires the jira-python library") + print("Install using 'sudo pip install jira'") sys.exit(-1) # User facing configs @@ -48,16 +48,19 @@ MIN_COMMENT_PR = int(os.environ.get("MIN_COMMENT_PR", "1496")) # the state of JIRA's that are tied to PR's we've already looked at. MAX_FILE = ".github-jira-max" + def get_url(url): try: return urllib2.urlopen(url) - except urllib2.HTTPError as e: - print "Unable to fetch URL, exiting: %s" % url + except urllib2.HTTPError: + print("Unable to fetch URL, exiting: %s" % url) sys.exit(-1) + def get_json(urllib_response): return json.load(urllib_response) + # Return a list of (JIRA id, JSON dict) tuples: # e.g. [('SPARK-1234', {.. json ..}), ('SPARK-5687', {.. 
json ..})} def get_jira_prs(): @@ -65,83 +68,86 @@ def get_jira_prs(): has_next_page = True page_num = 0 while has_next_page: - page = get_url(GITHUB_API_BASE + "/pulls?page=%s&per_page=100" % page_num) - page_json = get_json(page) - - for pull in page_json: - jiras = re.findall(JIRA_PROJECT_NAME + "-[0-9]{4,5}", pull['title']) - for jira in jiras: - result = result + [(jira, pull)] - - # Check if there is another page - link_header = filter(lambda k: k.startswith("Link"), page.info().headers)[0] - if not "next"in link_header: - has_next_page = False - else: - page_num = page_num + 1 + page = get_url(GITHUB_API_BASE + "/pulls?page=%s&per_page=100" % page_num) + page_json = get_json(page) + + for pull in page_json: + jiras = re.findall(JIRA_PROJECT_NAME + "-[0-9]{4,5}", pull['title']) + for jira in jiras: + result = result + [(jira, pull)] + + # Check if there is another page + link_header = filter(lambda k: k.startswith("Link"), page.info().headers)[0] + if "next" not in link_header: + has_next_page = False + else: + page_num += 1 return result + def set_max_pr(max_val): f = open(MAX_FILE, 'w') f.write("%s" % max_val) f.close() - print "Writing largest PR number seen: %s" % max_val + print("Writing largest PR number seen: %s" % max_val) + def get_max_pr(): if os.path.exists(MAX_FILE): result = int(open(MAX_FILE, 'r').read()) - print "Read largest PR number previously seen: %s" % result + print("Read largest PR number previously seen: %s" % result) return result else: return 0 + jira_client = jira.client.JIRA({'server': JIRA_API_BASE}, - basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) + basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) jira_prs = get_jira_prs() previous_max = get_max_pr() -print "Retrieved %s JIRA PR's from Github" % len(jira_prs) +print("Retrieved %s JIRA PR's from Github" % len(jira_prs)) jira_prs = [(k, v) for k, v in jira_prs if int(v['number']) > previous_max] -print "%s PR's remain after excluding visted ones" % len(jira_prs) +print("%s PR's remain after 
excluding visted ones" % len(jira_prs)) num_updates = 0 considered = [] -for issue, pr in sorted(jira_prs, key=lambda (k, v): int(v['number'])): +for issue, pr in sorted(jira_prs, key=lambda kv: int(kv[1]['number'])): if num_updates >= MAX_UPDATES: - break + break pr_num = int(pr['number']) - print "Checking issue %s" % issue + print("Checking issue %s" % issue) considered = considered + [pr_num] url = pr['html_url'] - title = "[Github] Pull Request #%s (%s)" % (pr['number'], pr['user']['login']) + title = "[Github] Pull Request #%s (%s)" % (pr['number'], pr['user']['login']) try: - existing_links = map(lambda l: l.raw['object']['url'], jira_client.remote_links(issue)) + existing_links = map(lambda l: l.raw['object']['url'], jira_client.remote_links(issue)) except: - print "Failure reading JIRA %s (does it exist?)" % issue - print sys.exc_info()[0] - continue + print("Failure reading JIRA %s (does it exist?)" % issue) + print(sys.exc_info()[0]) + continue if url in existing_links: continue - icon = {"title": "Pull request #%s" % pr['number'], - "url16x16": "https://assets-cdn.github.com/favicon.ico"} + icon = {"title": "Pull request #%s" % pr['number'], + "url16x16": "https://assets-cdn.github.com/favicon.ico"} destination = {"title": title, "url": url, "icon": icon} # For all possible fields see: - # https://developer.atlassian.com/display/JIRADEV/Fields+in+Remote+Issue+Links - # application = {"name": "Github pull requests", "type": "org.apache.spark.jira.github"} + # https://developer.atlassian.com/display/JIRADEV/Fields+in+Remote+Issue+Links + # application = {"name": "Github pull requests", "type": "org.apache.spark.jira.github"} jira_client.add_remote_link(issue, destination) - + comment = "User '%s' has created a pull request for this issue:" % pr['user']['login'] - comment = comment + ("\n%s" % pr['html_url']) + comment += "\n%s" % pr['html_url'] if pr_num >= MIN_COMMENT_PR: jira_client.add_comment(issue, comment) - - print "Added link %s <-> PR #%s" % 
(issue, pr['number']) - num_updates = num_updates + 1 + + print("Added link %s <-> PR #%s" % (issue, pr['number'])) + num_updates += 1 if len(considered) > 0: set_max_pr(max(considered)) diff --git a/dev/lint-python b/dev/lint-python index 3f878c2dad6b1154d8a8a8ccedbeaaf980f7c88a..c6f3fbfab84ed195f4612b8abe75fc6434f89560 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -19,10 +19,8 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" -PATHS_TO_CHECK="./python/pyspark/ ./examples/src/main/python/ ./dev/sparktestsupport" -# TODO: fix pep8 errors with the rest of the Python scripts under dev -PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/*.py ./dev/run-tests-jenkins.py" -PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/pip-sanity-check.py" +# Exclude auto-geneated configuration file. +PATHS_TO_CHECK="$( cd "$SPARK_ROOT_DIR" && find . -name "*.py" -not -path "*python/docs/conf.py" )" PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt" PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt" diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index 5ab285eae99b79de44701ec020cffbc7835eeceb..4bacb385184c6c332cda6b2f861e89a32637bae8 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -70,22 +70,22 @@ def get_json(url): return json.load(urllib2.urlopen(request)) except urllib2.HTTPError as e: if "X-RateLimit-Remaining" in e.headers and e.headers["X-RateLimit-Remaining"] == '0': - print "Exceeded the GitHub API rate limit; see the instructions in " + \ - "dev/merge_spark_pr.py to configure an OAuth token for making authenticated " + \ - "GitHub requests." 
+ print("Exceeded the GitHub API rate limit; see the instructions in " + + "dev/merge_spark_pr.py to configure an OAuth token for making authenticated " + + "GitHub requests.") else: - print "Unable to fetch URL, exiting: %s" % url + print("Unable to fetch URL, exiting: %s" % url) sys.exit(-1) def fail(msg): - print msg + print(msg) clean_up() sys.exit(-1) def run_cmd(cmd): - print cmd + print(cmd) if isinstance(cmd, list): return subprocess.check_output(cmd) else: @@ -97,14 +97,15 @@ def continue_maybe(prompt): if result.lower() != "y": fail("Okay, exiting") + def clean_up(): - print "Restoring head pointer to %s" % original_head + print("Restoring head pointer to %s" % original_head) run_cmd("git checkout %s" % original_head) branches = run_cmd("git branch").replace(" ", "").split("\n") for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches): - print "Deleting local branch %s" % branch + print("Deleting local branch %s" % branch) run_cmd("git branch -D %s" % branch) @@ -246,9 +247,9 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""): if cur_status == "Resolved" or cur_status == "Closed": fail("JIRA issue %s already has status '%s'" % (jira_id, cur_status)) - print ("=== JIRA %s ===" % jira_id) - print ("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" % ( - cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) + print("=== JIRA %s ===" % jira_id) + print("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" % + (cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) versions = asf_jira.project_versions("SPARK") versions = sorted(versions, key=lambda x: x.name, reverse=True) @@ -282,10 +283,10 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""): resolve = filter(lambda a: a['name'] == "Resolve Issue", asf_jira.transitions(jira_id))[0] resolution = filter(lambda r: r.raw['name'] == "Fixed", asf_jira.resolutions())[0] asf_jira.transition_issue( - jira_id, resolve["id"], fixVersions = 
jira_fix_versions, - comment = comment, resolution = {'id': resolution.raw['id']}) + jira_id, resolve["id"], fixVersions=jira_fix_versions, + comment=comment, resolution={'id': resolution.raw['id']}) - print "Successfully resolved %s with fixVersions=%s!" % (jira_id, fix_versions) + print("Successfully resolved %s with fixVersions=%s!" % (jira_id, fix_versions)) def resolve_jira_issues(title, merge_branches, comment): @@ -300,23 +301,29 @@ def resolve_jira_issues(title, merge_branches, comment): def standardize_jira_ref(text): """ Standardize the [SPARK-XXXXX] [MODULE] prefix - Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK XXX [MLLIB]: Issue" to "[SPARK-XXX][MLLIB] Issue" + Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK XXX [MLLIB]: Issue" to + "[SPARK-XXX][MLLIB] Issue" - >>> standardize_jira_ref("[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful") + >>> standardize_jira_ref( + ... "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful") '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is successful' - >>> standardize_jira_ref("[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests") + >>> standardize_jira_ref( + ... "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests") '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull requests' >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key") '[SPARK-5954][MLLIB] Top by key' >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl") '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl' - >>> standardize_jira_ref("SPARK-1094 Support MiMa for reporting binary compatibility accross versions.") + >>> standardize_jira_ref( + ... 
"SPARK-1094 Support MiMa for reporting binary compatibility accross versions.") '[SPARK-1094] Support MiMa for reporting binary compatibility accross versions.' >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark") '[SPARK-1146][WIP] Vagrant support for Spark' - >>> standardize_jira_ref("SPARK-1032. If Yarn app fails before registering, app master stays aroun...") + >>> standardize_jira_ref( + ... "SPARK-1032. If Yarn app fails before registering, app master stays aroun...") '[SPARK-1032] If Yarn app fails before registering, app master stays aroun...' - >>> standardize_jira_ref("[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.") + >>> standardize_jira_ref( + ... "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.") '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.' >>> standardize_jira_ref("Additional information for users building from source code") 'Additional information for users building from source code' @@ -350,7 +357,8 @@ def standardize_jira_ref(text): # Assemble full text (JIRA ref(s), module(s), remaining text) clean_text = ''.join(jira_refs).strip() + ''.join(components).strip() + " " + text.strip() - # Replace multiple spaces with a single space, e.g. if no jira refs and/or components were included + # Replace multiple spaces with a single space, e.g. 
if no jira refs and/or components were + # included clean_text = re.sub(r'\s+', ' ', clean_text.strip()) return clean_text @@ -385,17 +393,17 @@ def main(): # Decide whether to use the modified title or not modified_title = standardize_jira_ref(pr["title"]) if modified_title != pr["title"]: - print "I've re-written the title as follows to match the standard format:" - print "Original: %s" % pr["title"] - print "Modified: %s" % modified_title + print("I've re-written the title as follows to match the standard format:") + print("Original: %s" % pr["title"]) + print("Modified: %s" % modified_title) result = raw_input("Would you like to use the modified title? (y/n): ") if result.lower() == "y": title = modified_title - print "Using modified title:" + print("Using modified title:") else: title = pr["title"] - print "Using original title:" - print title + print("Using original title:") + print(title) else: title = pr["title"] @@ -414,13 +422,13 @@ def main(): merge_hash = merge_commits[0]["commit_id"] message = get_json("%s/commits/%s" % (GITHUB_API_BASE, merge_hash))["commit"]["message"] - print "Pull request %s has already been merged, assuming you want to backport" % pr_num + print("Pull request %s has already been merged, assuming you want to backport" % pr_num) commit_is_downloaded = run_cmd(['git', 'rev-parse', '--quiet', '--verify', - "%s^{commit}" % merge_hash]).strip() != "" + "%s^{commit}" % merge_hash]).strip() != "" if not commit_is_downloaded: fail("Couldn't find any merge commit for #%s, you may need to update HEAD." % pr_num) - print "Found commit %s:\n%s" % (merge_hash, message) + print("Found commit %s:\n%s" % (merge_hash, message)) cherry_pick(pr_num, merge_hash, latest_branch) sys.exit(0) @@ -429,9 +437,9 @@ def main(): "Continue? 
(experts only!)" continue_maybe(msg) - print ("\n=== Pull Request #%s ===" % pr_num) - print ("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % ( - title, pr_repo_desc, target_ref, url)) + print("\n=== Pull Request #%s ===" % pr_num) + print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % + (title, pr_repo_desc, target_ref, url)) continue_maybe("Proceed with merging pull request #%s?" % pr_num) merged_refs = [target_ref] @@ -445,14 +453,15 @@ def main(): if JIRA_IMPORTED: if JIRA_USERNAME and JIRA_PASSWORD: continue_maybe("Would you like to update an associated JIRA?") - jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % (pr_num, GITHUB_BASE, pr_num) + jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % \ + (pr_num, GITHUB_BASE, pr_num) resolve_jira_issues(title, merged_refs, jira_comment) else: - print "JIRA_USERNAME and JIRA_PASSWORD not set" - print "Exiting without trying to close the associated JIRA." + print("JIRA_USERNAME and JIRA_PASSWORD not set") + print("Exiting without trying to close the associated JIRA.") else: - print "Could not find jira-python library. Run 'sudo pip install jira' to install." - print "Exiting without trying to close the associated JIRA." + print("Could not find jira-python library. 
Run 'sudo pip install jira' to install.") + print("Exiting without trying to close the associated JIRA.") if __name__ == "__main__": import doctest diff --git a/examples/src/main/python/mllib/decision_tree_classification_example.py b/examples/src/main/python/mllib/decision_tree_classification_example.py index 1b529768b6c623b83b70e5e03f7653201f5df50e..7eecf500584ad81655852604e2784ad9df5f1be7 100644 --- a/examples/src/main/python/mllib/decision_tree_classification_example.py +++ b/examples/src/main/python/mllib/decision_tree_classification_example.py @@ -44,7 +44,8 @@ if __name__ == "__main__": # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) - testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) + testErr = labelsAndPredictions.filter( + lambda lp: lp[0] != lp[1]).count() / float(testData.count()) print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(model.toDebugString()) diff --git a/examples/src/main/python/mllib/decision_tree_regression_example.py b/examples/src/main/python/mllib/decision_tree_regression_example.py index cf518eac67e8119135c01728afc9b37f998f883e..acf9e25fdf31c03c3d1315b7fb1373506536276b 100644 --- a/examples/src/main/python/mllib/decision_tree_regression_example.py +++ b/examples/src/main/python/mllib/decision_tree_regression_example.py @@ -44,7 +44,7 @@ if __name__ == "__main__": # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) - testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\ + testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\ float(testData.count()) print('Test Mean Squared Error = ' + str(testMSE)) 
print('Learned regression tree model:') diff --git a/examples/src/main/python/mllib/gradient_boosting_classification_example.py b/examples/src/main/python/mllib/gradient_boosting_classification_example.py index b204cd1b31c868e4149ddda9b0a946a0bce041c8..65a03572be9b512169f9b89eb767d160f7a0144e 100644 --- a/examples/src/main/python/mllib/gradient_boosting_classification_example.py +++ b/examples/src/main/python/mllib/gradient_boosting_classification_example.py @@ -43,7 +43,8 @@ if __name__ == "__main__": # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) - testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) + testErr = labelsAndPredictions.filter( + lambda lp: lp[0] != lp[1]).count() / float(testData.count()) print('Test Error = ' + str(testErr)) print('Learned classification GBT model:') print(model.toDebugString()) diff --git a/examples/src/main/python/mllib/gradient_boosting_regression_example.py b/examples/src/main/python/mllib/gradient_boosting_regression_example.py index 758e224a9e21d4a4a6b393381b81110a4b806832..877f8ab461ccdf8fa718243d089d80e8fad6a3aa 100644 --- a/examples/src/main/python/mllib/gradient_boosting_regression_example.py +++ b/examples/src/main/python/mllib/gradient_boosting_regression_example.py @@ -43,7 +43,7 @@ if __name__ == "__main__": # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) - testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\ + testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\ float(testData.count()) print('Test Mean Squared Error = ' + str(testMSE)) print('Learned regression GBT model:') diff --git 
a/examples/src/main/python/mllib/linear_regression_with_sgd_example.py b/examples/src/main/python/mllib/linear_regression_with_sgd_example.py index 6fbaeff0cd5a0e3bae3e2288220d596622184e5b..6744463d40ef19bed04ead56b36a99a58d77e551 100644 --- a/examples/src/main/python/mllib/linear_regression_with_sgd_example.py +++ b/examples/src/main/python/mllib/linear_regression_with_sgd_example.py @@ -44,7 +44,7 @@ if __name__ == "__main__": # Evaluate the model on training data valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) MSE = valuesAndPreds \ - .map(lambda (v, p): (v - p)**2) \ + .map(lambda vp: (vp[0] - vp[1])**2) \ .reduce(lambda x, y: x + y) / valuesAndPreds.count() print("Mean Squared Error = " + str(MSE)) diff --git a/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py b/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py index e030b74ba6b15e3c2954d3412f56bd3ff89386ac..c9b768b3147d25e866f9cf458add5c86e2663c40 100644 --- a/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py +++ b/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py @@ -44,7 +44,7 @@ if __name__ == "__main__": # Evaluating the model on training data labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) - trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) + trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count()) print("Training Error = " + str(trainErr)) # Save and load model diff --git a/examples/src/main/python/mllib/naive_bayes_example.py b/examples/src/main/python/mllib/naive_bayes_example.py index 749353b20eb3ee7e5b1e75549203fba5fc97eda2..a29fcccac5bfc3cbf643ab566feb15dff680975b 100644 --- a/examples/src/main/python/mllib/naive_bayes_example.py +++ b/examples/src/main/python/mllib/naive_bayes_example.py @@ -50,7 +50,7 @@ if __name__ == "__main__": # Make prediction and test 
accuracy. predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label)) - accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count() + accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count() print('model accuracy {}'.format(accuracy)) # Save and load model @@ -59,7 +59,7 @@ if __name__ == "__main__": model.save(sc, output_dir) sameModel = NaiveBayesModel.load(sc, output_dir) predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label)) - accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count() + accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count() print('sameModel accuracy {}'.format(accuracy)) # $example off$ diff --git a/examples/src/main/python/mllib/random_forest_classification_example.py b/examples/src/main/python/mllib/random_forest_classification_example.py index 9e5a8dcaabb0e8c86399131b63beb3a31b5ea226..5ac67520daee03929f11c3f55bbcc6839a784db1 100644 --- a/examples/src/main/python/mllib/random_forest_classification_example.py +++ b/examples/src/main/python/mllib/random_forest_classification_example.py @@ -45,7 +45,8 @@ if __name__ == "__main__": # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) - testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) + testErr = labelsAndPredictions.filter( + lambda lp: lp[0] != lp[1]).count() / float(testData.count()) print('Test Error = ' + str(testErr)) print('Learned classification forest model:') print(model.toDebugString()) diff --git a/examples/src/main/python/mllib/random_forest_regression_example.py b/examples/src/main/python/mllib/random_forest_regression_example.py index 2e1be34c1a29a37335a44039f92aa63fb70c888b..7e986a0d307f04248844cbc018e3c9785a4a4498 100644 
--- a/examples/src/main/python/mllib/random_forest_regression_example.py +++ b/examples/src/main/python/mllib/random_forest_regression_example.py @@ -45,7 +45,7 @@ if __name__ == "__main__": # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) - testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\ + testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\ float(testData.count()) print('Test Mean Squared Error = ' + str(testMSE)) print('Learned regression forest model:') diff --git a/examples/src/main/python/mllib/svm_with_sgd_example.py b/examples/src/main/python/mllib/svm_with_sgd_example.py index 309ab09cc375a9e123e39324a7500f34c8224c7c..24b8f431e059e7cfd69718f600ca1f0ef591838c 100644 --- a/examples/src/main/python/mllib/svm_with_sgd_example.py +++ b/examples/src/main/python/mllib/svm_with_sgd_example.py @@ -38,7 +38,7 @@ if __name__ == "__main__": # Evaluating the model on training data labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) - trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) + trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count()) print("Training Error = " + str(trainErr)) # Save and load model diff --git a/examples/src/main/python/streaming/network_wordjoinsentiments.py b/examples/src/main/python/streaming/network_wordjoinsentiments.py index b85517dfdd913b256d61bb3ac380ddda7eb0443e..b309d9fad33f56706507939e0d156f672ea56628 100644 --- a/examples/src/main/python/streaming/network_wordjoinsentiments.py +++ b/examples/src/main/python/streaming/network_wordjoinsentiments.py @@ -67,8 +67,8 @@ if __name__ == "__main__": # with the static RDD inside the transform() method and then multiplying # the frequency of the words by its sentiment value 
happiest_words = word_counts.transform(lambda rdd: word_sentiments.join(rdd)) \ - .map(lambda (word, tuple): (word, float(tuple[0]) * tuple[1])) \ - .map(lambda (word, happiness): (happiness, word)) \ + .map(lambda word_tuples: (word_tuples[0], float(word_tuples[1][0]) * word_tuples[1][1])) \ + .map(lambda word_happiness: (word_happiness[1], word_happiness[0])) \ .transform(lambda rdd: rdd.sortByKey(False)) happiest_words.foreachRDD(print_happiest_words) diff --git a/python/docs/epytext.py b/python/docs/epytext.py index e884d5e6b19c74609bf73b00a69a35e4e51b244f..4bbbf650a13e97e56d99b42707e8b2a0c52245e6 100644 --- a/python/docs/epytext.py +++ b/python/docs/epytext.py @@ -9,6 +9,7 @@ RULES = ( ('pyspark.rdd.RDD', 'RDD'), ) + def _convert_epytext(line): """ >>> _convert_epytext("L{A}") @@ -19,9 +20,11 @@ def _convert_epytext(line): line = re.sub(p, sub, line) return line + def _process_docstring(app, what, name, obj, options, lines): for i in range(len(lines)): lines[i] = _convert_epytext(lines[i]) + def setup(app): app.connect("autodoc-process-docstring", _process_docstring) diff --git a/sql/hive/src/test/resources/data/scripts/cat.py b/sql/hive/src/test/resources/data/scripts/cat.py index 2395b2cdeb391bf58555c6552d57bae867002dba..aea0362f899fad8de2d0355c908eec9cdf0bef7b 100644 --- a/sql/hive/src/test/resources/data/scripts/cat.py +++ b/sql/hive/src/test/resources/data/scripts/cat.py @@ -16,14 +16,14 @@ # specific language governing permissions and limitations # under the License. 
# -import sys, re -import datetime +from __future__ import print_function +import sys import os -table_name=None -if os.environ.has_key('hive_streaming_tablename'): - table_name=os.environ['hive_streaming_tablename'] +table_name = None +if 'hive_streaming_tablename' in os.environ: + table_name = os.environ['hive_streaming_tablename'] for line in sys.stdin: - print line - print >> sys.stderr, "dummy" + print(line) + print("dummy", file=sys.stderr) diff --git a/sql/hive/src/test/resources/data/scripts/cat_error.py b/sql/hive/src/test/resources/data/scripts/cat_error.py index 9642efec8ecb4b7febf94b63dd5dc87574e27f9c..dc1bccece947e6dee8ef08b0fe50886f2f4077b4 100644 --- a/sql/hive/src/test/resources/data/scripts/cat_error.py +++ b/sql/hive/src/test/resources/data/scripts/cat_error.py @@ -19,6 +19,6 @@ import sys for line in sys.stdin: - print line + print(line) sys.exit(1) diff --git a/sql/hive/src/test/resources/data/scripts/doubleescapedtab.py b/sql/hive/src/test/resources/data/scripts/doubleescapedtab.py index d373067baed2c4d9558cce465961c960a3e62f1d..ff5a8b82f429ae7ab967c69c176d988260e44f45 100644 --- a/sql/hive/src/test/resources/data/scripts/doubleescapedtab.py +++ b/sql/hive/src/test/resources/data/scripts/doubleescapedtab.py @@ -19,6 +19,5 @@ import sys for line in sys.stdin: - print "1\\\\\\t2" - print "1\\\\\\\\t2" - + print("1\\\\\\t2") + print("1\\\\\\\\t2") diff --git a/sql/hive/src/test/resources/data/scripts/dumpdata_script.py b/sql/hive/src/test/resources/data/scripts/dumpdata_script.py index c96c9e529bbb187c11e591c2e51aa1655731ad46..341a1b40e07afd98288c8703ec669f2167646cf7 100644 --- a/sql/hive/src/test/resources/data/scripts/dumpdata_script.py +++ b/sql/hive/src/test/resources/data/scripts/dumpdata_script.py @@ -19,9 +19,9 @@ import sys for i in xrange(50): - for j in xrange(5): - for k in xrange(20022): - print 20000 * i + k + for j in xrange(5): + for k in xrange(20022): + print(20000 * i + k) for line in sys.stdin: - pass + pass diff --git 
a/sql/hive/src/test/resources/data/scripts/escapedcarriagereturn.py b/sql/hive/src/test/resources/data/scripts/escapedcarriagereturn.py index 475928a2430f64ecb8fb80ac9c1734f1378b2a4c..894cbdd1395150cfb0ce2c49b0601b57e5f55010 100644 --- a/sql/hive/src/test/resources/data/scripts/escapedcarriagereturn.py +++ b/sql/hive/src/test/resources/data/scripts/escapedcarriagereturn.py @@ -19,5 +19,4 @@ import sys for line in sys.stdin: - print "1\\\\r2" - + print("1\\\\r2") diff --git a/sql/hive/src/test/resources/data/scripts/escapednewline.py b/sql/hive/src/test/resources/data/scripts/escapednewline.py index 0d5751454bed7fc22259e330d456bddc9ebda636..ff47fe57347060210780c76bb12f431c528c0cb9 100644 --- a/sql/hive/src/test/resources/data/scripts/escapednewline.py +++ b/sql/hive/src/test/resources/data/scripts/escapednewline.py @@ -19,5 +19,4 @@ import sys for line in sys.stdin: - print "1\\\\n2" - + print("1\\\\n2") diff --git a/sql/hive/src/test/resources/data/scripts/escapedtab.py b/sql/hive/src/test/resources/data/scripts/escapedtab.py index 549c91e4446324b8a4606dca96b14bcf5b7daf70..d9743eec5642039a23e53a7551e224dd1790661e 100644 --- a/sql/hive/src/test/resources/data/scripts/escapedtab.py +++ b/sql/hive/src/test/resources/data/scripts/escapedtab.py @@ -19,5 +19,4 @@ import sys for line in sys.stdin: - print "1\\\\t2" - + print("1\\\\t2") diff --git a/sql/hive/src/test/resources/data/scripts/input20_script.py b/sql/hive/src/test/resources/data/scripts/input20_script.py index 40e3683dc3d363d77e2ca78b465a7b82b1cbcc98..08669cbf0a1a4fee1a69c43c0914257535f9c50f 100644 --- a/sql/hive/src/test/resources/data/scripts/input20_script.py +++ b/sql/hive/src/test/resources/data/scripts/input20_script.py @@ -21,10 +21,10 @@ import re line = sys.stdin.readline() x = 1 while line: - tem = sys.stdin.readline() - if line == tem: - x = x + 1 - else: - print str(x).strip()+'\t'+re.sub('\t','_',line.strip()) - line = tem - x = 1 \ No newline at end of file + tem = sys.stdin.readline() + if line 
== tem: + x += 1 + else: + print(str(x).strip()+'\t'+re.sub('\t', '_', line.strip())) + line = tem + x = 1 diff --git a/sql/hive/src/test/resources/data/scripts/newline.py b/sql/hive/src/test/resources/data/scripts/newline.py index 6500d900dd8ab3718c07c7a3deba2d29f1ba4d46..59c313fcc29f0dd61d71a426617da4bcbb866a21 100644 --- a/sql/hive/src/test/resources/data/scripts/newline.py +++ b/sql/hive/src/test/resources/data/scripts/newline.py @@ -19,6 +19,6 @@ import sys for line in sys.stdin: - print "1\\n2" - print "1\\r2" - print "1\\t2" + print("1\\n2") + print("1\\r2") + print("1\\t2")