"""
Example Usages

Create a new commitlist for consumption by categorize.py.
Said commitlist contains commits between v1.5.0 and f5bc91f851.

    python commitlist.py --create-new tags/v1.5.0 f5bc91f851

Update the existing commitlist to commit bfcb687b9c.

    python commitlist.py --update-to bfcb687b9c
"""

import argparse
import csv
import dataclasses
import os
import pprint
import re
from collections import defaultdict
from pathlib import Path
from typing import List, Optional

import common
from common import (
    features_to_dict,
    frontend_categories,
    get_commit_data_cache,
    run,
    topics,
)


# Increase the allowed size of a CSV field to 1mil bytes for long files changed
csv.field_size_limit(1000000)


@dataclasses.dataclass(frozen=False)
class Commit:
    """One row of the commit list; every field maps to one CSV column."""

    commit_hash: str
    category: str
    topic: str
    title: str
    files_changed: str
    # None when the commit has no PR number; becomes "" after a CSV round-trip.
    pr_link: Optional[str]
    author: str

    # This is not a list so that it is easier to put in a spreadsheet
    accepter_1: str
    accepter_2: str
    accepter_3: str

    # Hash of the commit this one should be folded into when exporting markdown.
    merge_into: Optional[str] = None

    def __repr__(self):
        return (
            f"Commit({self.commit_hash}, {self.category}, {self.topic}, {self.title})"
        )


# Column order of the CSV file; must stay in sync with the dataclass fields.
commit_fields = tuple(f.name for f in dataclasses.fields(Commit))


class CommitList:
    """A list of `Commit` rows backed by a CSV file at `path`."""

    # NB: Private ctor. Use `from_existing` or `create_new`.
    def __init__(self, path: str, commits: List[Commit]):
        self.path = path
        self.commits = commits

    @staticmethod
    def from_existing(path):
        """Load the commit list stored in the CSV file at `path`."""
        commits = CommitList.read_from_disk(path)
        return CommitList(path, commits)

    @staticmethod
    def create_new(path, base_version, new_version):
        """Build a commit list for the commits between two git revisions.

        Raises:
            ValueError: if a file already exists at `path`.
        """
        if os.path.exists(path):
            raise ValueError(
                "Attempted to create a new commitlist but one exists already!"
            )
        commits = CommitList.get_commits_between(base_version, new_version)
        return CommitList(path, commits)

    @staticmethod
    def read_from_disk(path) -> List[Commit]:
        """Parse the CSV at `path` into `Commit` objects.

        A non-empty `new_title` column overrides `title`. Columns that are
        not `Commit` fields are dropped; missing ones default to "".
        """
        # newline="" is required by the csv module so that newlines embedded
        # in quoted fields are parsed correctly.
        with open(path, newline="") as csvfile:
            reader = csv.DictReader(csvfile)
            rows = []
            for row in reader:
                if row.get("new_title", "") != "":
                    row["title"] = row["new_title"]
                filtered_rows = {k: row.get(k, "") for k in commit_fields}
                rows.append(Commit(**filtered_rows))
        return rows

    def write_result(self):
        """Write the current commits back to `self.path`."""
        self.write_to_disk_static(self.path, self.commits)

    @staticmethod
    def write_to_disk_static(path, commit_list):
        """Write `commit_list` as CSV to `path`, creating parent directories."""
        os.makedirs(Path(path).parent, exist_ok=True)
        # newline="" stops the csv writer from emitting "\r\r\n" row endings
        # (blank rows) on platforms that translate "\n" on write.
        with open(path, "w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(commit_fields)
            for commit in commit_list:
                writer.writerow(dataclasses.astuple(commit))

    @staticmethod
    def keywordInFile(file, keywords):
        """Return True if any of `keywords` is a substring of `file`."""
        return any(key in file for key in keywords)

    @staticmethod
    def gen_commit(commit_hash):
        """Create a categorized `Commit` from the cached data for `commit_hash`."""
        feature_item = get_commit_data_cache().get(commit_hash)
        features = features_to_dict(feature_item)
        category, topic = CommitList.categorize(features)
        # Pad with empty strings so there are always three accepter columns.
        # NOTE(review): assumes features["accepters"] is a tuple - confirm in common.
        a1, a2, a3 = (features["accepters"] + ("", "", ""))[:3]
        if features["pr_number"] is not None:
            pr_link = f"https://github.com/pytorch/pytorch/pull/{features['pr_number']}"
        else:
            pr_link = None
        files_changed_str = " ".join(features["files_changed"])
        return Commit(
            commit_hash,
            category,
            topic,
            features["title"],
            files_changed_str,
            pr_link,
            features["author"],
            a1,
            a2,
            a3,
        )

    @staticmethod
    def category_remapper(category: str) -> str:
        """Map a "release notes: ..." label value to the category name used here."""
        if category in frontend_categories:
            return category + "_frontend"
        if category == "Meta API":
            return "composability"
        if category in common.quantization.categories:
            return common.quantization.name
        if category in common.distributed.categories:
            return common.distributed.name
        return category

    @staticmethod
    def bracket_category_matcher(title: str):
        """Categorize a commit based on the presence of a bracketed category in
        the title.

        Args:
            title (str): title to search

        Returns:
            optional[str]: category of the first matching tag, or None
        """
        pairs = [
            ("[dynamo]", "dynamo"),
            ("[torchdynamo]", "dynamo"),
            ("[torchinductor]", "inductor"),
            ("[inductor]", "inductor"),
            # No closing bracket on purpose: matches any tag starting with "[codemod".
            ("[codemod", "skip"),
            ("[profiler]", "profiler"),
            ("[functorch]", "functorch"),
            ("[autograd]", "autograd_frontend"),
            ("[quantization]", "quantization"),
            ("[nn]", "nn_frontend"),
            ("[complex]", "complex_frontend"),
            ("[mps]", "mps"),
            ("[optimizer]", "optimizer_frontend"),
            ("[xla]", "xla"),
        ]
        title_lower = title.lower()
        for bracket, category in pairs:
            if bracket in title_lower:
                return category
        return None

    @staticmethod
    def _category_from_files(files_changed):
        """Return the category of the first changed file matching a path rule.

        Files are visited in order and, for each file, rules are tried in the
        order listed below; the first hit wins. Returns None if nothing matches.
        """
        quantization = common.quantization.name
        # (match against lowercased path?, keywords, category)
        rules = (
            (
                False,
                (
                    "docker/",
                    ".circleci",
                    ".github",
                    ".jenkins",
                    ".ci",
                    ".azure_pipelines",
                ),
                "releng",
            ),
            # datapipe(s), torch/utils/data, test_{dataloader, datapipe}
            (
                False,
                ("torch/utils/data", "test_dataloader", "test_datapipe"),
                "dataloader_frontend",
            ),
            (False, ("torch/csrc/api", "test/cpp/api"), "cpp_frontend"),
            (False, ("distributed", "c10d"), "distributed"),
            (True, ("vulkan",), "vulkan"),
            # The keyword must be lowercase: it is compared to a lowercased path.
            (True, ("foreach",), "foreach_frontend"),
            (True, ("onnx",), "onnx"),
            (False, ("torch/fx", "test_fx"), "fx"),
            (False, ("torch/ao", "test/ao"), quantization),
            # torch/quantization, test/quantization, aten/src/ATen/native/quantized,
            # torch/nn/{quantized, quantizable}
            (
                False,
                (
                    "torch/quantization",
                    "test/quantization",
                    "aten/src/ATen/native/quantized",
                    "torch/nn/quantiz",
                ),
                quantization,
            ),
            (False, ("torch/package", "test/package"), "package"),
            (
                False,
                (
                    "torch/csrc/jit/mobile",
                    "aten/src/ATen/native/metal",
                    "test/mobile",
                    "torch/backends/_nnapi/",
                    "test/test_nnapi.py",
                ),
                "mobile",
            ),
            (
                False,
                (
                    "aten/src/ATen/native/LinearAlgebra.cpp",
                    "test/test_linalg.py",
                    "torch/linalg",
                ),
                "linalg_frontend",
            ),
            (
                False,
                (
                    "torch/sparse",
                    "aten/src/ATen/native/sparse",
                    "torch/_masked/__init__.py",
                ),
                "sparse_frontend",
            ),
            (False, ("tools/autograd",), "autograd_frontend"),
            (
                False,
                (
                    "test/test_nn.py",
                    "test/test_module.py",
                    "torch/nn/modules",
                    "torch/nn/functional.py",
                ),
                "nn_frontend",
            ),
            (False, ("torch/csrc/jit", "torch/jit"), "jit"),
            (
                False,
                (
                    "torch/_meta_registrations.py",
                    "torch/_decomp",
                    "torch/_prims",
                    "torch/_refs",
                ),
                "composability",
            ),
            (False, ("torch/_dynamo",), "dynamo"),
            (False, ("torch/_inductor",), "inductor"),
        )
        for file in files_changed:
            file_lowercase = file.lower()
            for use_lowercase, keywords, category in rules:
                haystack = file_lowercase if use_lowercase else file
                if CommitList.keywordInFile(haystack, keywords):
                    return category
        return None

    @staticmethod
    def categorize(features):
        """Guess `(category, topic)` for a commit from its feature dict.

        Reads the "title", "labels", "pr_number" and "files_changed" keys.
        Checks run in this order: revert without PR, PR labels, "caffe2" in
        the title, "Reverted" label, bracketed title tag, changed-file paths,
        and finally a few whole-commit heuristics.
        """
        title = features["title"]
        labels = features["labels"]
        category = "Uncategorized"
        topic = "Untopiced"

        # Revert commits are merged directly to master with no associated PR number
        if features["pr_number"] is None:
            if title.startswith("Revert"):
                return "skip", topic

        # We ask contributors to label their PR's appropriately
        # when they're first landed.
        # Check if the labels are there first.
        already_categorized = already_topiced = False
        for label in labels:
            if label.startswith("release notes: "):
                category = label.split("release notes: ", 1)[1]
                category = CommitList.category_remapper(category)
                already_categorized = True
            if label.startswith("topic: "):
                topic = label.split("topic: ", 1)[1]
                already_topiced = True
        if already_categorized and already_topiced:
            return category, topic
        # NOTE(review): when only one of the two labels is present, a category
        # taken from the label can still be overwritten by the heuristics below.

        # update this to check if each file starts with caffe2
        if "caffe2" in title:
            return "caffe2", topic
        if "Reverted" in labels:
            return "skip", topic
        if "module: deprecation" in labels:
            topic = "deprecation"

        found_bracket_category = CommitList.bracket_category_matcher(title)
        if found_bracket_category:
            return found_bracket_category, topic

        files_changed = features["files_changed"]
        file_category = CommitList._category_from_files(files_changed)
        if file_category is not None:
            category = file_category
        # Below are some extra quick checks that aren't necessarily file-path related,
        # but I found that to catch a decent number of extra commits.
        # They only run when no file-path rule matched.
        elif len(files_changed) > 0 and all(
            f_name.endswith((".cu", ".cuh")) for f_name in files_changed
        ):
            category = "cuda"
        elif "[PyTorch Edge]" in title:
            category = "mobile"
        elif (
            len(files_changed) == 1
            and "torch/testing/_internal/common_methods_invocations.py"
            in files_changed[0]
        ):
            # when this is the only file changed, it's almost always an OpInfo change.
            category = "python_frontend"
        elif len(files_changed) == 1 and "torch/_torch_docs.py" in files_changed[0]:
            # individual torch_docs changes are usually for python ops
            category = "python_frontend"

        # If we couldn't find a category but the topic is not user facing we can skip these:
        if category == "Uncategorized" and topic == "not user facing":
            category = "skip"

        return category, topic

    @staticmethod
    def get_commits_between(base_version, new_version):
        """Return `Commit`s for `merge-base(base, new)..new`, oldest first.

        Returns an empty list when the range contains no commits.
        """
        cmd = f"git merge-base {base_version} {new_version}"
        rc, merge_base, _ = run(cmd)
        assert rc == 0, f"`{cmd}` failed with exit code {rc}"
        # Guard against a trailing newline ending up inside the next command.
        merge_base = merge_base.strip()

        # Returns a list of something like
        # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
        cmd = f"git log --reverse --oneline {merge_base}..{new_version}"
        rc, commits, _ = run(cmd)
        assert rc == 0, f"`{cmd}` failed with exit code {rc}"

        # Skip blank lines: an empty range yields "" which has no hash to extract.
        hashes = [
            log_line.strip().split(" ", 1)[0]
            for log_line in commits.split("\n")
            if log_line.strip()
        ]
        return [CommitList.gen_commit(commit_hash) for commit_hash in hashes]

    def filter(self, *, category=None, topic=None):
        """Return the commits matching `category` and/or `topic` (None = any)."""
        commits = self.commits
        if category is not None:
            commits = [commit for commit in commits if commit.category == category]
        if topic is not None:
            commits = [commit for commit in commits if commit.topic == topic]
        return commits

    def update_to(self, new_version):
        """Append the commits between the last listed commit and `new_version`.

        Raises:
            ValueError: if the list is empty, so there is no commit to start from.
        """
        if not self.commits:
            raise ValueError(
                "Cannot update an empty commitlist; create a new one instead."
            )
        last_hash = self.commits[-1].commit_hash
        new_commits = CommitList.get_commits_between(last_hash, new_version)
        self.commits += new_commits

    def stat(self):
        """Return nested counts as `counts[category][topic]`."""
        counts = defaultdict(lambda: defaultdict(int))
        for commit in self.commits:
            counts[commit.category][commit.topic] += 1
        return counts


def create_new(path, base_version, new_version):
    """Create a commit list for the given revision range and save it to `path`."""
    commits = CommitList.create_new(path, base_version, new_version)
    commits.write_result()


def update_existing(path, new_version):
    """Extend the commit list stored at `path` up to `new_version` and save it."""
    commits = CommitList.from_existing(path)
    commits.update_to(new_version)
    commits.write_result()


def rerun_with_new_filters(path):
    """Re-categorize rows that are "Uncategorized" or have an unknown category.

    Both category and topic of such rows are replaced with the result of
    `CommitList.categorize`; all other rows are left untouched.
    """
    current_commits = CommitList.from_existing(path)
    for i, commit in enumerate(current_commits.commits):
        current_category = commit.category
        if (
            current_category == "Uncategorized"
            or current_category not in common.categories
        ):
            feature_item = get_commit_data_cache().get(commit.commit_hash)
            features = features_to_dict(feature_item)
            category, topic = CommitList.categorize(features)
            current_commits.commits[i] = dataclasses.replace(
                commit, category=category, topic=topic
            )
    current_commits.write_result()


def get_hash_or_pr_url(commit: Commit):
    """Return a markdown PR link `[#N](url)`, or the commit hash as a fallback.

    The hash is used when `pr_link` is None/empty or is not a pytorch PR URL.
    """
    pr_link = commit.pr_link
    if not pr_link:
        return commit.commit_hash
    match = re.search(r"https://github.com/pytorch/pytorch/pull/([0-9]+)", pr_link)
    if match is None:
        return commit.commit_hash
    return f"[#{match.group(1)}]({pr_link})"


def to_markdown(commit_list: CommitList, category):
    """Render the commits of `category` as markdown lines grouped by topic.

    Commits with `merge_into` set are not listed on their own; their link is
    appended to the entry of the commit they are merged into.
    """

    def cleanup_title(commit):
        # Drop the trailing " (#1234)" PR suffix (and anything after it).
        match = re.match(r"(.*) \(#\d+\)", commit.title)
        if match is None:
            return commit.title
        return match.group(1)

    # Target commit hash -> commits that should be folded into it.
    merge_mapping = defaultdict(list)
    for commit in commit_list.commits:
        if commit.merge_into:
            merge_mapping[commit.merge_into].append(commit)

    lines = [f"\n## {category}\n"]
    for topic in topics:
        lines.append(f"### {topic}\n")
        commits = commit_list.filter(category=category, topic=topic)
        # Accept both "bug fixes" and "bug_fixes" spellings of the same topic.
        if "_" in topic:
            commits.extend(
                commit_list.filter(category=category, topic=topic.replace("_", " "))
            )
        if " " in topic:
            commits.extend(
                commit_list.filter(category=category, topic=topic.replace(" ", "_"))
            )
        for commit in commits:
            if commit.merge_into:
                continue
            all_related_commits = merge_mapping[commit.commit_hash] + [commit]
            commit_list_md = ", ".join(
                get_hash_or_pr_url(c) for c in all_related_commits
            )
            result = f"- {cleanup_title(commit)} ({commit_list_md})\n"
            lines.append(result)
    return lines


def get_markdown_header(category):
    """Return the worksheet instructions for `category` as a one-element list."""
    header = f"""
# Release Notes worksheet {category}

The main goal of this process is to rephrase all the commit messages below to make them **clear and easy to read** by the end user. You should follow the following instructions to do so:

* **Please clean up and format commit titles to be readable by the general PyTorch user.** Make sure you're [following the guidance here](https://docs.google.com/document/d/14OmgGBr1w6gl1VO47GGGdwrIaUNr92DFhQbY_NEk8mQ/edit)! Your resulting notes must be consistent and easy to read.
* Please sort commits into the following categories (you should not rename the categories!), I tried to pre-sort these to ease your work, feel free to move commits around if the current categorization is not good.
* Anything that is not public facing needs to be removed.
* If anything is miscategorized/belongs to another domain, move it to `miscategorized.md`.
* Please scan through `miscategorized.md` and handle any commits that belong within your domain according to these instructions.
* We place a lot of emphasis on the “BC-breaking” and “deprecation” sections. Those should be where the most effort goes in. The “improvements” and “bug fixes” for Python API should be nice as well.
* Once you are finished, move this very file from `todo/` to `done/` and submit a pull request.

The categories below are as follows:

* BC breaking: All commits that are BC-breaking. These are the most important commits. If any pre-sorted commit is actually BC-breaking, do move it to this section. Each commit should contain a paragraph explaining the rational behind the change as well as an example for how to update user code [BC-Guidelines](https://docs.google.com/document/d/14OmgGBr1w6gl1VO47GGGdwrIaUNr92DFhQbY_NEk8mQ/edit#heading=h.a9htwgvvec1m).
* Deprecations: All commits introducing deprecation. Each commit should include a small example explaining what should be done to update user code.
* new_features: All commits introducing a new feature (new functions, new submodule, new supported platform etc)
* improvements: All commits providing improvements to existing feature should be here (new backend for a function, new argument, better numerical stability)
* bug fixes: All commits that fix bugs and behaviors that do not match the documentation
* performance: All commits that are added mainly for performance (we separate this from improvements above to make it easier for users to look for it)
* documentation: All commits that add/update documentation
* Developers: All commits that are not end-user facing but still impact people that compile from source, develop into pytorch, extend pytorch, etc
* not user facing: All commits that are not public end-user facing and hence should be dropped from the release notes
"""

    return [header]


def main():
    """Command-line entry point; exactly one action flag must be given."""
    parser = argparse.ArgumentParser(description="Tool to create a commit list")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--create-new", "--create_new", nargs=2)
    group.add_argument("--update-to", "--update_to")
    # I found this flag useful when experimenting with adding new auto-categorizing filters.
    # After running commitlist.py the first time, if you add any new filters in this file,
    # re-running with "rerun_with_new_filters" will update the existing commitlist.csv file,
    # but only affect the rows that were previously marked as "Uncategorized"
    group.add_argument(
        "--rerun-with-new-filters", "--rerun_with_new_filters", action="store_true"
    )
    group.add_argument("--stat", action="store_true")
    group.add_argument("--export-markdown", "--export_markdown", action="store_true")
    group.add_argument(
        "--export-csv-categories", "--export_csv_categories", action="store_true"
    )
    parser.add_argument("--path", default="results/commitlist.csv")
    args = parser.parse_args()

    if args.create_new:
        create_new(args.path, args.create_new[0], args.create_new[1])
        # Report the path actually written, which may differ from the default.
        print(
            f"Finished creating new commit list. Results have been saved to {args.path}"
        )
        return
    if args.update_to:
        update_existing(args.path, args.update_to)
        return
    if args.rerun_with_new_filters:
        rerun_with_new_filters(args.path)
        return
    if args.stat:
        commits = CommitList.from_existing(args.path)
        stats = commits.stat()
        pprint.pprint(stats)
        return

    if args.export_csv_categories:
        commits = CommitList.from_existing(args.path)
        categories = list(commits.stat().keys())
        for category in categories:
            print(f"Exporting {category}...")
            filename = f"results/export/result_{category}.csv"
            CommitList.write_to_disk_static(filename, commits.filter(category=category))
        return

    if args.export_markdown:
        commits = CommitList.from_existing(args.path)
        categories = list(commits.stat().keys())
        for category in categories:
            print(f"Exporting {category}...")
            lines = get_markdown_header(category)
            lines += to_markdown(commits, category)
            filename = f"results/export/result_{category}.md"
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, "w") as f:
                f.writelines(lines)
        return
    # The argument group is required, so one branch above always returns.
    raise AssertionError


if __name__ == "__main__":
    main()