""" Playbooks for running full analyses """
import os
import logging
import shutil
from urllib.parse import urljoin
from okra.assn4 import get_truck_factor_by_project
from okra.error_handling import (MissingEnvironmentVariableError,
NetworkError,
DirectoryNotCreatedError)
from okra.models import DataAccessLayer
from okra.populate_db import populate_db
from okra.repo_mgmt import (create_parent_dir, clone_repo, update_repo,
compress_repo, decompress_repo)
logger = logging.getLogger(__name__)
[docs]def local_persistance(repo_name: str, parent_dir: str, buffer_size=4096):
""" Collect relevant data for locally cloned repos.
:param repo_name: name of git repository, <linux>
:param parent_dir: parent directory path, </home/user/code/>
:param buffer_size: number of records processed before db commit
:return: populate sqlite database for a repo
:rtype: None
"""
logger.info("STARTED -- local persistance")
repodb = "__REPODB__".join(repo_name.split("/"))
cache = parent_dir
dburl = "sqlite:///" + cache + repodb + ".db"
populate_db(dburl, cache, repo_name, buffer_size)
logger.info("FINISHED -- local perisistance")
[docs]def simple_version_truck_factor(repos: list, dirpath: str, dburl: str, b:int):
""" Simple version of the truck factor analysis.
This is a basic version of the truck factor which
does not attempt to run the analysis at scale. It's
just a proof of concept version. Writes out a csv file with
the truck factor of each repository. You can use this csv file
to do further analysis in R.
:param repos: repo queue with value format '<repo owner>/<repo name>'
:param dirpath: path to working directory to store git repos
:param dburl: database url (ex. 'sqlite:///:memory')
:param b: batch size (ex. 1024)
:return: outputs truck factor analysis as csv file
:rtype: None, writes out csv file
"""
logger.info("STARTED -- simple-version truck factor")
# Retrieve or update git repos
for repo_name in repos:
rpath = urljoin(dirpath, repo_name)
create_parent_dir(repo_name, dirpath)
if os.path.exists(rpath):
update_repo(repo_name, dirpath)
else:
clone_repo(repo_name, dirpath)
# Populate database
populate_db(dburl, dirpath, repos, b)
# Compute truck factor for each project
dal = DataAccessLayer(dburl)
dal.connect()
dal.session = dal.Session()
results = []
for repo_name in repos:
owner, project = repo_name.split("/")
truck_factor, _ = get_truck_factor_by_project(owner, project, dal)
results.append((repo_name, truck_factor))
dal.session.close()
logger.info("COMPLETED -- simple-version truck factor")
return results
def retrieve_or_clone(repo_name: str, dirpath: str) -> bool:
# check s3 bucket for repo
repopath = urljoin(dirpath, repo_name)
if os.path.exists(repopath): # may already exist
return True
return clone_repo(repo_name, dirpath)