""" Handle the data requirements for Assignment 1
References:
Assignment 1: "http://janvitek.org/events/NEU/6050/a1.html"
Git log formatting: "https://git-scm.com/docs/pretty-formats"
"""
import csv
import logging
import os
import subprocess
from okra.repo_mgmt import read_repos
from okra.proto.assn1_pb2 import Commit, Message, File, Inventory
logger = logging.getLogger(__name__)
def parse_inventory(rpath: str, repo_name: str):
    """ Return inventory information for a git repo.

    Runs ``git log -1 --pretty=%H`` inside the repo to capture the most
    recent commit hash.

    :param rpath: repository path
    :param repo_name: "owner/project" identifier of the repository
    :return: inventory message object, or None when git fails
    :rtype: okra.protobuf.assn1_pb2.Inventory
    """
    owner, project = repo_name.split("/")
    proc = subprocess.run(["git", "log", "-1", "--pretty=%H"],
                          cwd=rpath,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
    if proc.returncode != 0:
        logger.error("FAIL - {}, {}, inventory not extracted".\
                     format(repo_name, rpath))
        return None
    logger.info("SUCCESS - extracted inventory info")
    inv = Inventory()
    inv.owner = owner
    inv.project = project
    inv.last_hash = proc.stdout.decode('utf-8', 'ignore').strip()
    return inv
def parse_commits(rpath: str, chash=""):
    """ Yields a protocol buffer of git commit information.

    Runs ``git log`` with a ``^|^``-delimited pretty format and parses one
    Commit message per output line.

    :param rpath: path to git repository
    :param chash: optional param, retrieve all commits since commit hash
    :return: :class: okra.protobuf.assn1_pb2.Commit
    :rtype: generator, protocol buffer
    """
    cmd = ["git", "log",
           "--pretty=%H^|^%an^|^%ae^|^%aI^|^%cn^|^%ce^|^%cI"]
    if len(chash) > 0:
        # Restrict history to commits made after the given hash.
        cmd.append("{}..HEAD".format(chash))
    proc = subprocess.run(cmd, cwd=rpath, stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
    if proc.returncode != 0:
        logger.error("FAIL -- unable to extract git commits info")
        return
    logger.info("SUCCESS -- extracted git commit info")
    for idx, raw in enumerate(proc.stdout.splitlines()):
        fields = raw.decode('utf-8', 'ignore').split("^|^")
        if len(fields) != 7:
            # Malformed row (e.g. delimiter embedded in a field); skip it.
            logger.error("Issue with row {}, repo '{}'".\
                         format(idx, rpath))
            continue
        commit = Commit()
        (commit.hash_val,
         commit.author,
         commit.author_email,
         commit.author_timestamp,
         commit.committer,
         commit.committer_email,
         commit.committer_timestamp) = fields
        yield commit
def write_line_commits(parsed_commits):
    """ Generate a csv row (list of fields) for each parsed commit. """
    columns = ("hash_val", "author", "author_email", "author_timestamp",
               "committer", "committer_email", "committer_timestamp")
    for commit in parsed_commits:
        yield [getattr(commit, name) for name in columns]
def parse_messages(rpath: str, chash=""):
    """ Yields a protocol buffer of a git commit message.

    Records are separated by the ``^^!^^`` sentinel (the message body may
    span multiple lines), and fields within a record by ``^|^``.

    :param rpath: path to git repository
    :param chash: optional param, retrieve all commits since commit hash
    :return: :class: okra.protobuf.assn1_pb2.Message
    :rtype: generator, protocol buffer
    """
    cmd = ["git", "log",
           "--pretty=^^!^^%H^|^%s^|^%b^|^%aI"]
    if len(chash) > 0:
        # Restrict history to commits made after the given hash.
        cmd.append("{}..HEAD".format(chash))
    proc = subprocess.run(cmd, cwd=rpath, stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
    if proc.returncode != 0:
        logger.error("FAIL -- unable to extract messages_csv")
        return
    logger.info("SUCCESS -- extracted messages_csv info")
    records = proc.stdout.decode('utf-8', 'ignore').split("^^!^^")
    for idx, record in enumerate(records):
        parts = record.split("^|^")
        if len(parts) != 4:
            # Malformed record (delimiter collision or empty leading split).
            logger.error("Issue with row {}, repo '{}'".\
                         format(idx, rpath))
            continue
        message = Message()
        message.hash_val = parts[0]
        message.subject = parts[1]
        message.message_body = parts[2]
        message.timestamp = parts[3].strip()
        yield message
def write_line_messages(parsed_messages):
    """ Generate a csv row (list of fields) for each commit message. """
    for msg in parsed_messages:
        yield [msg.hash_val, msg.subject, msg.message_body]
def parse_committed_files(rpath: str, chash=''):
    """ Parse per-commit file statistics from ``git log --numstat``.

    Output is grouped per commit: the ``^|^`` sentinel starts a group whose
    first line is the commit hash, followed by one numstat row per file
    ("<added>\\t<deleted>\\t<path>"; "-" for binary files).

    :param rpath: path to git repository
    :param chash: optional param, retrieve all commits since commit hash
    :return: :class: okra.protobuf.assn1_pb2.File
    :rtype: generator, protocol buffer
    """
    cmd = ["git", "log",
           '--pretty=^|^%n%H',
           '--numstat']
    if len(chash) > 0:
        cmd.append("{}..HEAD".format(chash))
    res = subprocess.run(cmd, cwd=rpath, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    if res.returncode != 0:
        # FIX: bail out on git failure instead of falling through to parse
        # empty stdout (matches the guard style of the other parsers).
        logger.error("Unable find file info: {}".format(rpath))
        return
    for group in res.stdout.decode('utf-8', 'ignore').split("^|^"):
        lines = group.strip().splitlines()
        if not lines:
            continue  # leading empty split or blank group
        commit_hash = lines[0]
        for stat_line in lines[1:]:
            # FIX: cap the split at 2 so file paths containing whitespace
            # stay intact in props[2] (plain split() truncated them).
            props = stat_line.split(None, 2)
            if len(props) < 3:
                continue  # blank or malformed numstat row
            finfo = File()
            finfo.hash_val = commit_hash
            finfo.added = props[0]
            finfo.deleted = props[1]
            finfo.file_path = props[2]
            yield finfo
def write_line_files(parsed_files):
    """ Generate a csv row (list of fields) for each committed file. """
    for record in parsed_files:
        yield [record.hash_val, record.file_path]
def extract_data_main(fpath: str, dirpath: str):
    """ Extract data as requested in Assignment 1.

    Walks every repo listed in ``fpath`` (cloned under ``dirpath``) and
    appends rows to commits.csv, messages.csv, and files.csv in ``dirpath``.

    :param fpath: path to the file of repo names read by ``read_repos``
    :param dirpath: directory holding the cloned repos and output CSVs
        (concatenated directly, so it should end with a path separator)
    :return: None; writes the three CSV files as a side effect
    """
    logger.info("STARTED data extraction")
    # FIX: original referenced undefined name `parse_files`; the parser
    # defined in this module is `parse_committed_files`.
    jobs = [("commits.csv", parse_commits, write_line_commits),
            ("messages.csv", parse_messages, write_line_messages),
            ("files.csv", parse_committed_files, write_line_files)]
    repos = read_repos(fpath)
    logger.info("Extracting data for {} git repos".format(len(repos)))
    # newline="" is required by the csv module so the writer controls
    # line endings (avoids blank lines on Windows).
    outs = {fname: open(dirpath + fname, "w", newline="")
            for fname, _, _ in jobs}
    try:
        # One writer per output file, created once instead of per repo.
        writers = {fname: csv.writer(outs[fname],
                                     delimiter=",",
                                     quotechar='"',
                                     quoting=csv.QUOTE_MINIMAL)
                   for fname, _, _ in jobs}
        for repo_name in repos:
            rpath = dirpath + repo_name
            logger.info("Extracting data in repo '{}'".format(rpath))
            for fname, parse, to_lines in jobs:
                logger.info("Adding to file '{}'".format(fname))
                for line in to_lines(parse(rpath)):
                    writers[fname].writerow(line)
    finally:
        # FIX: close output handles even if a repo fails mid-extraction.
        for fobj in outs.values():
            fobj.close()
        logger.info("Closed files")
    logger.info("FINISHED data extraction")