Add release note scripts (#47360)

Summary:
The first commit contains the initial code from Richard's branch.
The second commit contains the changes that I made during the writing process.
The third commit updates the tool to support a category/topic pair for each commit.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/47360

Reviewed By: ejguan

Differential Revision: D24741003

Pulled By: albanD

fbshipit-source-id: d0fcc6765968dc1732d8a515688d11372c7e653d
This commit is contained in:
Alban Desmaison 2020-11-05 06:40:59 -08:00 committed by Facebook GitHub Bot
parent a4ba018e57
commit 68954fe897
5 changed files with 557 additions and 0 deletions

View File

@ -0,0 +1,134 @@
import json
import argparse
import os
import textwrap
from common import dict_to_features, categories, topics, get_features, CommitDataCache
from commitlist import CommitList
class Categorizer:
    """Interactive tool that walks through commits in a given category and
    prompts the user to assign each one a (category, topic) pair.

    State is persisted to disk after every choice via CommitList, so the
    session can be interrupted and resumed.
    """

    def __init__(self, path, category='Uncategorized'):
        # Disk-backed cache of per-commit metadata (title, body, labels, ...)
        self.cache = CommitDataCache()
        self.commits = CommitList.from_existing(path)
        # Special categories: 'Uncategorized'
        # All other categories must be real
        self.category = category

    def categorize(self):
        # NB: CommitList.filter takes keyword-only arguments; the original
        # positional call `filter(self.category)` raised a TypeError.
        commits = self.commits.filter(category=self.category)
        i = 0
        while i < len(commits):
            cur_commit = commits[i]
            next_commit = commits[i + 1] if i + 1 < len(commits) else None
            jump_to = self.handle_commit(cur_commit, i + 1, len(commits), commits)
            # Increment counter
            if jump_to is not None:
                i = jump_to
            elif next_commit is None:
                i = len(commits)
            else:
                i = commits.index(next_commit)

    def features(self, commit):
        """Return cached metadata (Features) for the given commit."""
        return self.cache.get(commit.commit_hash)

    def potential_reverts_of(self, commit, commits):
        """Find later commits whose title contains this commit's cleaned title
        (likely reverts or re-lands). Returns {display_index: commit}."""
        if 'Updating submodules' in commit.title:
            return {}
        index = commits.index(commit)
        # Strip the trailing PR suffix, e.g. " (#35011)".
        # NOTE(review): a fixed [:-10] slice assumes a 5-digit PR number;
        # consider a regex if PR numbers grow.
        cleaned_title = commit.title[:-10]
        # NB: the index + 2 is sketch
        return {(index + 2 + delta): cand
                for delta, cand in enumerate(commits[index + 1:])
                if cleaned_title in cand.title and
                commit.commit_hash != cand.commit_hash}

    def handle_commit(self, commit, i, total, commits):
        """Display one commit and prompt for its category and topic.

        Returns an index to jump to, or None to advance to the next commit.
        An empty answer at either prompt keeps the commit's current value.
        """
        potential_reverts = self.potential_reverts_of(commit, commits)
        if potential_reverts:
            potential_reverts = f'!!!POTENTIAL REVERTS!!!: {potential_reverts}'
        else:
            potential_reverts = ""
        features = self.features(commit)
        breaking_alarm = ""
        if 'topic: bc-breaking' in features.labels:
            breaking_alarm += "!!!!!! BC BREAKING !!!!!!"
        if 'module: deprecation' in features.labels:
            breaking_alarm += "!!!!!! DEPRECATION !!!!!!"
        os.system('clear')
        view = textwrap.dedent(f'''\
            [{i}/{total}]
            ================================================================================
            {features.title}
            {features.body}
            Files changed: {features.files_changed}
            Labels: {features.labels}
            {potential_reverts} {breaking_alarm}
            Current category: {commit.category}
            Select from: {', '.join(categories)}
        ''')
        print(view)
        cat_choice = None
        while cat_choice is None:
            value = input('category> ').strip()
            if len(value) == 0:
                # Empty input keeps the current category
                cat_choice = commit.category
                continue
            # Prefix-match against the known categories; require uniqueness
            choices = [cat for cat in categories
                       if cat.startswith(value)]
            if len(choices) != 1:
                print(f'Possible matches: {choices}, try again')
                continue
            cat_choice = choices[0]
        print(f'\nSelected: {cat_choice}')
        print(f'\nCurrent topic: {commit.topic}')
        print(f'''Select from: {', '.join(topics)}''')
        topic_choice = None
        while topic_choice is None:
            value = input('topic> ').strip()
            if len(value) == 0:
                topic_choice = commit.topic
                continue
            choices = [cat for cat in topics
                       if cat.startswith(value)]
            if len(choices) != 1:
                print(f'Possible matches: {choices}, try again')
                continue
            topic_choice = choices[0]
        print(f'\nSelected: {topic_choice}')
        self.update_commit(commit, cat_choice, topic_choice)
        return None

    def update_commit(self, commit, category, topic):
        """Record the chosen pair on the commit and persist the whole list."""
        assert category in categories
        assert topic in topics
        commit.category = category
        commit.topic = topic
        self.commits.write_to_disk()
def main():
    """Entry point: parse CLI flags and launch the interactive categorizer."""
    parser = argparse.ArgumentParser(description='Tool to help categorize commits')
    parser.add_argument('--category', type=str, default='Uncategorized',
                        help='Which category to filter by. "Uncategorized", None, or a category name')
    parser.add_argument('--file', help='The location of the commits CSV',
                        default='results/commitlist.csv')
    args = parser.parse_args()
    Categorizer(args.file, args.category).categorize()


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,181 @@
import argparse
from common import run, topics
from collections import namedtuple, defaultdict
import os
import csv
import pprint
from common import CommitDataCache
import re
"""
Example Usages
Create a new commitlist for consumption by categorize.py.
Said commitlist contains commits between v1.5.0 and f5bc91f851.
python commitlist.py --create_new tags/v1.5.0 f5bc91f851
Update the existing commitlist to commit bfcb687b9c.
python commitlist.py --update_to bfcb687b9c
"""
class Commit:
    """A single commitlist row: hash, release-note category, topic, title."""

    def __init__(self, commit_hash, category, topic, title):
        self.commit_hash = commit_hash
        self.category = category
        self.topic = topic
        self.title = title

    def __eq__(self, other):
        # Only Commit instances can compare equal; all four fields must match.
        if not isinstance(other, self.__class__):
            return False
        return (self.commit_hash, self.category, self.topic, self.title) == \
            (other.commit_hash, other.category, other.topic, other.title)

    def __repr__(self):
        return f'Commit({self.commit_hash}, {self.category}, {self.topic}, {self.title})'
class CommitList:
    """An ordered list of Commit rows, backed by a CSV file on disk."""

    # NB: Private ctor. Use `from_existing` or `create_new`.
    def __init__(self, path, commits):
        self.path = path
        self.commits = commits

    @staticmethod
    def from_existing(path):
        """Load a commitlist previously written to *path*."""
        commits = CommitList.read_from_disk(path)
        return CommitList(path, commits)

    @staticmethod
    def create_new(path, base_version, new_version):
        """Build a commitlist from the git range base..new.

        Raises ValueError if *path* already exists (refuses to clobber
        an existing, possibly hand-categorized, list).
        """
        if os.path.exists(path):
            raise ValueError('Attempted to create a new commitlist but one exists already!')
        commits = CommitList.get_commits_between(base_version, new_version)
        return CommitList(path, commits)

    @staticmethod
    def read_from_disk(path):
        """Parse the CSV at *path* into Commit objects (extra columns ignored)."""
        with open(path) as csvfile:
            rows = list(csv.reader(csvfile))
        assert all(len(row) >= 4 for row in rows)
        return [Commit(*row[:4]) for row in rows]

    def write_to_disk(self):
        """Write all commits back to self.path as CSV, one row per commit."""
        with open(self.path, 'w') as csvfile:
            writer = csv.writer(csvfile)
            for commit in self.commits:
                writer.writerow([commit.commit_hash, commit.category, commit.topic, commit.title])

    @staticmethod
    def get_commits_between(base_version, new_version):
        """Return uncategorized Commit objects for merge-base(base, new)..new,
        oldest first. Shells out to git; asserts on git failure."""
        cmd = f'git merge-base {base_version} {new_version}'
        rc, merge_base, _ = run(cmd)
        assert rc == 0
        # Returns a list of something like
        # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
        cmd = f'git log --reverse --oneline {merge_base}..{new_version}'
        rc, commits, _ = run(cmd)
        assert rc == 0
        log_lines = commits.split('\n')
        hashes, titles = zip(*[log_line.split(' ', 1) for log_line in log_lines])
        return [Commit(commit_hash, 'Uncategorized', 'Untopiced', title)
                for commit_hash, title in zip(hashes, titles)]

    def filter(self, *, category=None, topic=None):
        """Return the commits matching the given category and/or topic.

        Both filters are keyword-only; a None filter matches everything.
        """
        commits = self.commits
        if category is not None:
            commits = [commit for commit in commits if commit.category == category]
        if topic is not None:
            commits = [commit for commit in commits if commit.topic == topic]
        return commits

    def update_to(self, new_version):
        """Append the commits between our last known commit and *new_version*."""
        last_hash = self.commits[-1].commit_hash
        self.commits += CommitList.get_commits_between(last_hash, new_version)

    def stat(self):
        """Return nested counts: {category: {topic: count}}."""
        counts = defaultdict(lambda: defaultdict(int))
        for commit in self.commits:
            counts[commit.category][commit.topic] += 1
        return counts
def create_new(path, base_version, new_version):
    """Create a fresh commitlist CSV at *path* covering base..new."""
    commit_list = CommitList.create_new(path, base_version, new_version)
    commit_list.write_to_disk()
def update_existing(path, new_version):
    """Extend the commitlist at *path* up to *new_version* and save it."""
    commit_list = CommitList.from_existing(path)
    commit_list.update_to(new_version)
    commit_list.write_to_disk()
def to_markdown(commit_list, category):
    """Render all commits in *category*, grouped by topic, as markdown lines.

    Each commit becomes a bullet linking to its PR when the PR number is
    known, otherwise showing the raw commit hash.
    """
    def cleanup_title(commit):
        # Strip the trailing " (#12345)" PR-number suffix, if present.
        # NB: raw string -- '\(' in a plain string is an invalid escape.
        match = re.match(r'(.*) \(#\d+\)', commit.title)
        if match is None:
            return commit.title
        return match.group(1)

    cdc = CommitDataCache()
    lines = [f'\n## {category}\n']
    for topic in topics:
        lines.append(f'### {topic}\n')
        commits = commit_list.filter(category=category, topic=topic)
        for commit in commits:
            result = cleanup_title(commit)
            maybe_pr_number = cdc.get(commit.commit_hash).pr_number
            if maybe_pr_number is None:
                result = f'- {result} ({commit.commit_hash})\n'
            else:
                result = f'- {result} ([#{maybe_pr_number}](https://github.com/pytorch/pytorch/pull/{maybe_pr_number}))\n'
            lines.append(result)
    return lines
def main():
    """Entry point: dispatch to create/update/stat/export based on CLI flags."""
    parser = argparse.ArgumentParser(description='Tool to create a commit list')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--create_new', nargs=2)
    group.add_argument('--update_to')
    group.add_argument('--stat', action='store_true')
    group.add_argument('--export_markdown', action='store_true')
    parser.add_argument('--path', default='results/commitlist.csv')
    args = parser.parse_args()

    # NB: compare against None rather than truthiness so that an
    # empty-string value (e.g. `--update_to ''`) is still dispatched
    # instead of silently falling through.
    if args.create_new is not None:
        create_new(args.path, args.create_new[0], args.create_new[1])
        return
    if args.update_to is not None:
        update_existing(args.path, args.update_to)
        return
    if args.stat:
        commits = CommitList.from_existing(args.path)
        stats = commits.stat()
        pprint.pprint(stats)
        return
    if args.export_markdown:
        commits = CommitList.from_existing(args.path)
        categories = list(commits.stat().keys())
        lines = []
        for category in categories:
            lines += to_markdown(commits, category)
        filename = 'results/result.md'  # plain string; was a placeholder-free f-string
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'w') as f:
            f.writelines(lines)
        return
    # Unreachable: the required mutually-exclusive group guarantees one branch.
    raise AssertionError('no command specified')


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,196 @@
from collections import namedtuple
from os.path import expanduser
import locale
import subprocess
import re
import requests
import os
import json
# Release-note categories a commit may be assigned to.
# 'Uncategorized' is the initial placeholder assigned by commitlist.py;
# 'skip' marks commits to omit from the notes entirely.
categories = [
    'Uncategorized',
    'distributed',
    'mobile',
    'jit',
    'visualization',
    'onnx',
    'caffe2',
    'quantization',
    'amd',
    'benchmark',
    'profiler',
    'dispatcher',
    'releng',
    'fx',
    'code_coverage',
    'vulkan',
    'skip',
    'cpp_frontend',
    'python_frontend',
    'complex_frontend',
    'vmap_frontend',
    'autograd_frontend',
    'build_frontend',
    'memory_format_frontend',
    'foreach_frontend',
]

# Release-note topics (sections) within each category.
# 'Untopiced' is the initial placeholder value; it is stored as data in the
# commitlist CSV, so do not rename it without migrating existing files.
topics = [
    'bc_breaking',
    'deprecations',
    'new_features',
    'improvements',
    'bug_fixes',
    'performance',
    'docs',
    'devs',
    'Untopiced',
]
# Metadata gathered for a single commit / pull request.
Features = namedtuple('Features', [
    'title',
    'body',
    'pr_number',
    'files_changed',
    'labels',
])


def dict_to_features(dct):
    """Build a Features record from a plain dict (extra keys are ignored)."""
    return Features(**{field: dct[field] for field in Features._fields})


def features_to_dict(features):
    """Convert a Features record into a plain dict."""
    return dict(features._asdict())
def run(command):
    """Run *command* through the shell; return (return-code, stdout, stderr).

    stdout/stderr are decoded with the locale's preferred encoding and
    stripped of surrounding whitespace.
    """
    # subprocess.run replaces the manual Popen/communicate dance.
    result = subprocess.run(command, shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    enc = locale.getpreferredencoding()
    output = result.stdout.decode(enc)
    err = result.stderr.decode(enc)
    return result.returncode, output.strip(), err.strip()
def commit_body(commit_hash):
    """Return the commit message body for *commit_hash*, or None on git failure."""
    ret, out, _ = run(f'git log -n 1 --pretty=format:%b {commit_hash}')
    return out if ret == 0 else None
def commit_title(commit_hash):
    """Return the one-line commit title for *commit_hash*, or None on git failure."""
    ret, out, _ = run(f'git log -n 1 --pretty=format:%s {commit_hash}')
    return out if ret == 0 else None
def commit_files_changed(commit_hash):
    """Return the list of file paths touched by *commit_hash*, or None on git failure."""
    ret, out, _ = run(f'git diff-tree --no-commit-id --name-only -r {commit_hash}')
    return out.split('\n') if ret == 0 else None
def parse_pr_number(body, commit_hash, title):
    """Extract the PR number from a commit message body.

    Returns the first PR number (as a string) from the
    "Pull Request resolved" line, or None if no such line exists.
    """
    regex = r'Pull Request resolved: https://github.com/pytorch/pytorch/pull/([0-9]+)'
    matches = re.findall(regex, body)
    if len(matches) == 0:
        # Reverts and submodule updates routinely lack a PR link; stay quiet.
        if 'revert' not in title.lower() and 'updating submodules' not in title.lower():
            print(f'[{commit_hash}: {title}] Could not parse PR number, ignoring PR')
        return None
    if len(matches) > 1:
        print(f'[{commit_hash}: {title}] Got two PR numbers, using the first one')
    # NB: single return for both the one-match and many-match cases
    return matches[0]
def get_ghstack_token():
    """Read the GitHub OAuth token from the user's ~/.ghstackrc.

    Raises RuntimeError if no `github_oauth = ...` line is found.
    """
    pattern = 'github_oauth = (.*)'
    # Open read-only: we never write the config back ('r+' would also fail
    # on a read-only config file).
    with open(expanduser('~/.ghstackrc'), 'r') as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]
# NB: module-level side effect -- importing this module reads ~/.ghstackrc
# and raises RuntimeError if no GitHub OAuth token is configured there.
token = get_ghstack_token()
headers = {"Authorization": f"token {token}"}
def run_query(query):
    """POST a GraphQL *query* to the GitHub API and return the parsed JSON.

    Raises on any non-200 HTTP status.
    """
    response = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if response.status_code != 200:
        raise Exception("Query failed to run by returning code of {}. {}".format(response.status_code, query))
    return response.json()
def gh_labels(pr_number):
    """Return the label names attached to the given pytorch/pytorch PR.

    Issues a GitHub GraphQL query; only the first 10 labels are fetched.
    """
    query = f"""
    {{
      repository(owner: "pytorch", name: "pytorch") {{
        pullRequest(number: {pr_number}) {{
          labels(first: 10) {{
            edges {{
              node {{
                name
              }}
            }}
          }}
        }}
      }}
    }}
    """
    # NB: `query` is rebound here to the parsed JSON response.
    query = run_query(query)
    edges = query['data']['repository']['pullRequest']['labels']['edges']
    return [edge['node']['name'] for edge in edges]
def get_features(commit_hash, return_dict=False):
    """Gather title, body, files changed, PR number and labels for a commit.

    Returns a Features record, or a plain dict when return_dict is True.
    """
    title = commit_title(commit_hash)
    body = commit_body(commit_hash)
    files_changed = commit_files_changed(commit_hash)
    pr_number = parse_pr_number(body, commit_hash, title)
    # Labels come from GitHub and require a PR number; default to none.
    labels = gh_labels(pr_number) if pr_number is not None else []
    result = Features(title, body, pr_number, files_changed, labels)
    return features_to_dict(result) if return_dict else result
class CommitDataCache:
    """Disk-backed cache mapping commit hash -> Features.

    Persists to JSON at *path* so expensive git/GitHub lookups happen at
    most once per commit across runs.
    """

    def __init__(self, path='results/data.json'):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()

    def get(self, commit):
        # NB: membership test on the dict directly, not `.keys()`
        if commit not in self.data:
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        """Load the JSON cache and rehydrate values into Features records."""
        with open(self.path, 'r') as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct)
                    for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        """Serialize the cache to JSON, creating the directory if needed."""
        data = {commit: features._asdict() for commit, features in self.data.items()}
        # Previously this failed if the results/ directory did not exist yet.
        directory = os.path.dirname(self.path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(self.path, 'w') as f:
            json.dump(data, f)

View File

@ -0,0 +1 @@
PyGithub

View File

@ -0,0 +1,45 @@
import unittest
import tempfile
from commitlist import CommitList
class TestCommitList(unittest.TestCase):
    """Integration tests for CommitList.

    NOTE(review): these shell out to git and pin exact commit counts/hashes,
    so they require a pytorch checkout containing the referenced tags.
    """

    def test_create_new(self):
        with tempfile.TemporaryDirectory() as tempdir:
            commit_list_path = f'{tempdir}/commitlist.csv'
            commit_list = CommitList.create_new(commit_list_path, 'v1.5.0', '7543e7e558')
            self.assertEqual(len(commit_list.commits), 2143)
            self.assertEqual(commit_list.commits[0].commit_hash, '7335f079ab')
            self.assertTrue(commit_list.commits[0].title.startswith('[pt][quant] qmul and qadd'))
            self.assertEqual(commit_list.commits[-1].commit_hash, '7543e7e558')
            self.assertTrue(commit_list.commits[-1].title.startswith('Migrate minall, max, maxall'))

    def test_read_write(self):
        with tempfile.TemporaryDirectory() as tempdir:
            commit_list_path = f'{tempdir}/commitlist.csv'
            initial = CommitList.create_new(commit_list_path, 'v1.5.0', '7543e7e558')
            initial.write_to_disk()

            expected = CommitList.from_existing(commit_list_path)
            expected.commits[-2].category = 'foobar'
            expected.write_to_disk()

            commit_list = CommitList.from_existing(commit_list_path)
            # NB: don't shadow the `expected` CommitList with the loop variable
            for commit, expected_commit in zip(commit_list.commits, expected.commits):
                self.assertEqual(commit, expected_commit)

    def test_update_to(self):
        with tempfile.TemporaryDirectory() as tempdir:
            commit_list_path = f'{tempdir}/commitlist.csv'
            initial = CommitList.create_new(commit_list_path, 'v1.5.0', '7543e7e558')
            initial.commits[-2].category = 'foobar'
            self.assertEqual(len(initial.commits), 2143)
            initial.write_to_disk()

            commit_list = CommitList.from_existing(commit_list_path)
            commit_list.update_to('5702a28b26')
            self.assertEqual(len(commit_list.commits), 2143 + 4)
            self.assertEqual(commit_list.commits[-5], initial.commits[-1])


if __name__ == '__main__':
    unittest.main()