# File: //kunden/proc/self/root/lib/python3/dist-packages/breezy/plugins/fastimport/exporter.py
# -*- coding: utf-8 -*-
# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Based on bzr-fast-export
# Copyright (c) 2008 Adeodato Simó
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# vim: fileencoding=utf-8
"""Core engine for the fast-export command."""
from __future__ import absolute_import
# TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
# is not updated (because the parent of commit is already merged, so we don't
# set new_git_branch to the previously used name)
try:
from email.utils import parseaddr
except ImportError: # python < 3
from email.Utils import parseaddr
import sys
import time
import re
import breezy.branch
import breezy.revision
from ... import (
builtins,
errors,
lazy_import,
lru_cache,
osutils,
progress,
trace,
)
from ...sixish import (
int2byte,
PY3,
viewitems,
)
from . import (
helpers,
marks_file,
)
lazy_import.lazy_import(globals(),
"""
from fastimport import commands
""")
# Number of revisions that BzrFastExporter.emit_commits slices off the list
# of interesting revisions and processes per batch.
REVISIONS_CHUNK_SIZE = 1000
def _get_output_stream(destination):
    """Open the binary stream that the export is written to.

    :param destination: output path. None or '-' selects standard output;
        a path ending in '.gz' is opened through gzip; any other path is
        opened as a plain binary file.
    :return: a file-like object opened for writing bytes
    """
    if destination in (None, '-'):
        # Prefer the underlying byte buffer when sys.stdout exposes one.
        raw_stdout = getattr(sys.stdout, "buffer", sys.stdout)
        return helpers.binary_stream(raw_stdout)
    if destination.endswith('.gz'):
        import gzip
        return gzip.open(destination, 'wb')
    return open(destination, 'wb')
# from dulwich.repo:
def check_ref_format(refname):
    """Tell whether a ref name passes the checks below.

    The checks are modelled on git-check-ref-format[1].

    [1] http://www.kernel.org/pub/software/scm/git/docs/git-check-ref-format.html

    :param refname: The refname to check, as bytes
    :return: True if refname is valid, False otherwise
    """
    # The rules are kept as separate checks, in the same order as [1].
    if b'/.' in refname or refname.startswith(b'.'):
        return False
    if b'/' not in refname:
        return False
    if b'..' in refname:
        return False
    # Iterating a bytearray yields integers on both Python 2 and Python 3.
    forbidden = bytearray(b'\177 ~^:?*[')
    for code in bytearray(refname):
        if code < 0o40 or code in forbidden:
            return False
    # refname is non-empty here because it contains at least one '/'.
    if refname.endswith((b'/', b'.', b'.lock')):
        return False
    return b'@{' not in refname and b'\\' not in refname
def sanitize_ref_name_for_git(refname):
    """Rewrite refname so that it will be accepted by git-fast-import.

    Each byte sequence matched by one of the rules below is replaced with
    a single underscore, in one left-to-right pass. The rules mirror the
    checks made by check_ref_format.

    Rewriting can map distinct names onto the same result. This function
    does not check for such collisions, so uniqueness of the resulting
    ref names has to be verified by the caller.

    :param refname: refname to rewrite, as bytes
    :return: new refname, as bytes
    """
    invalid = re.compile(
        # '/.' in refname or startswith '.'
        br"/\.|^\."
        # '..' in refname
        br"|\.\."
        # ord(c) < 040, or c in '\177 ~^:?*['
        br"|[\000-\037\177 ~^:?*[]"
        # last char in "/."
        br"|[/.]$"
        # endswith '.lock'; the dot is escaped so that names that merely
        # end in e.g. 'unlock' or 'block' are left alone
        br"|\.lock$"
        # "@{" in refname
        br"|@\{"
        # "\\" in refname
        br"|\\")
    return invalid.sub(b"_", refname)
class BzrFastExporter(object):
def __init__(self, source, outf, ref=None, checkpoint=-1,
             import_marks_file=None, export_marks_file=None, revision=None,
             verbose=False, plain_format=False, rewrite_tags=False,
             no_tags=False, baseline=False):
    """Export branch data in fast import format.

    :param source: the branch to export; stored as self.branch
    :param outf: object whose write() receives the serialised commands
    :param ref: ref name handed to the reset and commit commands
    :param checkpoint: when greater than zero, emit_commit saves the marks
        and emits a checkpoint command at multiples of this count
    :param import_marks_file: if set, marks are loaded from it through
        marks_file.import_marks
    :param export_marks_file: if set, marks are written to it through
        marks_file.export_marks
    :param revision: revision range passed to builtins._get_revision_range;
        when not set, no start or end revision is used
    :param verbose: if True, progress is reported every 100 commits
        instead of every 1000, and implicit renames are noted
    :param plain_format: if True, 'classic' fast-import format is
        used without any extended features; if False, the generated
        data is richer and includes information like multiple
        authors, revision properties, etc.
    :param rewrite_tags: if True and if plain_format is set, tag names
        will be rewritten to be git-compatible.
        Otherwise tags which aren't valid for git will be skipped if
        plain_format is set.
    :param no_tags: if True tags won't be exported at all
    :param baseline: if True, the first revision is emitted through
        emit_baseline as a full tree
    """
    # What to export and where to send it
    self.branch = source
    self.outf = outf
    self.ref = ref
    self.revision = revision
    self.excluded_revisions = set()

    # Marks and checkpointing
    self.checkpoint = checkpoint
    self.import_marks_file = import_marks_file
    self.export_marks_file = export_marks_file

    # Output format switches
    self.plain_format = plain_format
    self.rewrite_tags = rewrite_tags
    self.no_tags = no_tags
    self.baseline = baseline

    self.tree_cache = lru_cache.LRUCache(max_cache=20)
    self._multi_author_api_available = hasattr(
        breezy.revision.Revision, 'get_apparent_authors')
    # Author information is exported separately from the other properties.
    self.properties_to_exclude = ['authors', 'author']

    # Progress reporting stuff
    self.verbose = verbose
    self.progress_every = 100 if verbose else 1000
    self._start_time = time.time()
    self._commit_total = 0

    # Load the marks and initialise things accordingly
    self.revid_to_mark = {}
    # Branch names are no longer included in the marks file, so this
    # mapping is left empty.
    self.branch_names = {}
    if self.import_marks_file:
        marks_info = marks_file.import_marks(self.import_marks_file)
        if marks_info is not None:
            # The loaded mapping is keyed by mark; invert it.
            self.revid_to_mark = {
                revid: mark for mark, revid in marks_info.items()}
def interesting_history(self):
    """Work out which revisions to export.

    When self.revision is set it is resolved to a start and an end
    revision; otherwise no bounds are used. As a side effect,
    self.excluded_revisions is recomputed when a start revision is known.

    :return: a list of revision ids, in the reverse of the order in which
        the branch's iter_merge_sorted_revisions yields them
    """
    start_rev_id = end_rev_id = None
    if self.revision:
        rev1, rev2 = builtins._get_revision_range(
            self.revision, self.branch, "fast-export")
        start_rev_id, end_rev_id = rev1.rev_id, rev2.rev_id

    self.note("Calculating the revisions to include ...")
    merge_sorted = self.branch.iter_merge_sorted_revisions(
        end_rev_id, start_rev_id)
    view_revisions = [rev_id for rev_id, _, _, _ in merge_sorted]
    view_revisions.reverse()

    # If a starting point was given, we need to later check that we don't
    # start emitting revisions from before that point. Collect the
    # revisions to exclude now ...
    if start_rev_id is not None:
        self.note("Calculating the revisions to exclude ...")
        before_start = self.branch.iter_merge_sorted_revisions(start_rev_id)
        self.excluded_revisions = {
            rev_id for rev_id, _, _, _ in before_start}
        if self.baseline:
            # needed so the first relative commit knows its parent
            self.excluded_revisions.remove(start_rev_id)
            view_revisions.insert(0, start_rev_id)
    return list(view_revisions)
def emit_commits(self, interesting):
    """Emit a commit command for every revision in ``interesting``.

    When self.baseline is set, the first entry is removed from
    ``interesting`` (the list is modified in place) and emitted through
    emit_baseline. The remaining revisions are handled in batches of
    REVISIONS_CHUNK_SIZE: each batch is first run through
    preprocess_commit, then the trees that are needed are fetched, and
    finally the commits are emitted in order.

    Revisions for which preprocess_commit reports nothing to do (ghosts,
    excluded revisions and revisions that already have a mark) are
    skipped instead of being emitted.

    :param interesting: list of revision ids, in emission order
    """
    if self.baseline:
        revobj = self.branch.repository.get_revision(interesting.pop(0))
        self.emit_baseline(revobj, self.ref)
    for i in range(0, len(interesting), REVISIONS_CHUNK_SIZE):
        chunk = interesting[i:i + REVISIONS_CHUNK_SIZE]
        history = dict(self.branch.repository.iter_revisions(chunk))
        trees_needed = set()
        to_emit = []
        for revid in chunk:
            needed = self.preprocess_commit(
                revid, history[revid], self.ref)
            if not needed:
                # None or an empty result means there is nothing to emit
                # for this revision; set.update(None) would raise.
                continue
            trees_needed.update(needed)
            to_emit.append(revid)
        if not to_emit:
            continue
        trees = {}
        for tree in self._get_revision_trees(trees_needed):
            trees[tree.get_revision_id()] = tree
        for revid in to_emit:
            revobj = history[revid]
            if revobj.parent_ids:
                parent = revobj.parent_ids[0]
            else:
                parent = breezy.revision.NULL_REVISION
            self.emit_commit(revobj, self.ref, trees[parent], trees[revid])
def run(self):
    """Run the whole export.

    Under a read lock on the branch's repository: compute the revisions
    to export, emit the feature commands (unless plain_format is set),
    the commits, and the tags (when the branch supports them and no_tags
    is not set). Afterwards the marks are saved and the statistics noted.
    """
    repository = self.branch.repository
    # Export the data
    with repository.lock_read():
        revisions = self.interesting_history()
        self._commit_total = len(revisions)
        self.note("Starting export of %d revisions ..." %
                  self._commit_total)
        if not self.plain_format:
            self.emit_features()
        self.emit_commits(revisions)
        if self.branch.supports_tags() and not self.no_tags:
            self.emit_tags()

    # Save the marks if requested
    self._save_marks()
    self.dump_stats()
def note(self, msg, *args):
    """Pass msg, prefixed with the time of day, on to trace.note.

    :param msg: the message text
    :param args: extra arguments forwarded unchanged to trace.note
    """
    stamped = "%s %s" % (self._time_of_day(), msg)
    trace.note(stamped, *args)
def warning(self, msg, *args):
    """Pass msg, prefixed with the time of day and 'WARNING:', on to
    trace.warning.

    :param msg: the message text
    :param args: extra arguments forwarded unchanged to trace.warning
    """
    stamped = "%s WARNING: %s" % (self._time_of_day(), msg)
    trace.warning(stamped, *args)
def _time_of_day(self):
    """Return the current local time formatted as HH:MM:SS."""
    # Note: this is a separate method so tests can patch in a fixed value
    now = time.localtime()
    return time.strftime("%H:%M:%S", now)
def report_progress(self, commit_count, details=''):
    """Note the export progress every self.progress_every commits.

    Nothing is reported when commit_count is zero or is not a multiple of
    self.progress_every.

    :param commit_count: number of commits exported so far
    :param details: extra text appended to the progress message
    """
    if not commit_count or commit_count % self.progress_every != 0:
        return
    if self._commit_total:
        counts = "%d/%d" % (commit_count, self._commit_total)
    else:
        counts = "%d" % (commit_count,)
    minutes = (time.time() - self._start_time) / 60
    if minutes > 0:
        rate = commit_count * 1.0 / minutes
        if rate > 10:
            rate_str = "at %.0f/minute " % rate
        else:
            rate_str = "at %.1f/minute " % rate
    else:
        # No measurable time has elapsed (coarse clock, or the clock went
        # backwards): leave the rate out rather than divide by zero or
        # report a negative rate.
        rate_str = ""
    self.note("%s commits exported %s%s" % (counts, rate_str, details))
def dump_stats(self):
    """Note how many entries are in the mark table and how long the
    export has been running."""
    elapsed = progress.str_tdelta(time.time() - self._start_time)
    count = len(self.revid_to_mark)
    noun = helpers.single_plural(count, "revision", "revisions")
    self.note("Exported %d %s in %s", count, noun, elapsed)
def print_cmd(self, cmd):
    """Write one command, followed by a newline, to the output stream.

    :param cmd: the command object to serialise
    """
    # On Python 3 the command is rendered with %s, otherwise with %r.
    template = b"%s\n" if PY3 else b"%r\n"
    self.outf.write(template % cmd)
def _save_marks(self):
    """Write the marks to self.export_marks_file, if one was given."""
    if not self.export_marks_file:
        return
    # export_marks is given a mapping keyed by mark, so invert ours.
    by_mark = {mark: revid for revid, mark in self.revid_to_mark.items()}
    marks_file.export_marks(self.export_marks_file, by_mark)
def is_empty_dir(self, tree, path):
    """Tell whether path is a directory without entries in tree.

    :param tree: the tree to look in
    :param path: the path to check
    :return: True if path is a directory whose first walkdirs() block has
        no entries; False otherwise, including when tree.kind() raises
        NoSuchFile (a warning is issued in that case)
    """
    try:
        kind = tree.kind(path)
    except errors.NoSuchFile:
        self.warning("Skipping empty_dir detection - no file_id for %s" %
                     (path,))
        return False
    # Only directories can be empty directories
    if kind != 'directory':
        return False
    # Use treewalk to find the contents of our directory
    first_block = list(tree.walkdirs(prefix=path))[0]
    return len(first_block[1]) == 0
def emit_features(self):
    """Emit one feature command per name in commands.FEATURE_NAMES, in
    sorted order."""
    for name in sorted(commands.FEATURE_NAMES):
        feature_cmd = commands.FeatureCommand(name)
        self.print_cmd(feature_cmd)
def emit_baseline(self, revobj, ref):
    """Emit revobj as a commit that contains its full source tree.

    The file commands are computed against the tree of NULL_REVISION.
    The revision is recorded under mark 1, and a reset command for ref is
    emitted before the commit.

    :param revobj: the revision object to emit
    :param ref: the ref name for the reset and commit commands
    """
    # Emit a full source tree of the first commit's parent
    baseline_mark = 1
    revid = revobj.revision_id
    self.revid_to_mark[revid] = baseline_mark
    empty_tree = self.branch.repository.revision_tree(
        breezy.revision.NULL_REVISION)
    [full_tree] = list(self._get_revision_trees([revid]))
    file_cmds = self._get_filecommands(empty_tree, full_tree)
    self.print_cmd(commands.ResetCommand(ref, None))
    commit_cmd = self._get_commit_command(
        ref, baseline_mark, revobj, file_cmds)
    self.print_cmd(commit_cmd)
def preprocess_commit(self, revid, revobj, ref):
    """Assign a mark to a revision and report which trees it needs.

    :param revid: id of the revision
    :param revobj: the revision object, or None for a ghost revision
    :param ref: unused here
    :return: a list with the ids of the revisions whose trees are needed
        to emit this commit (its first parent, or NULL_REVISION when it
        has no parents, followed by the revision itself). The list is
        empty when there is nothing to emit: the revision already has a
        mark, is excluded, or is a ghost. An empty list is returned
        rather than None so that the result can always be fed to
        set.update().
    """
    if revid in self.revid_to_mark or revid in self.excluded_revisions:
        return []
    if revobj is None:
        # This is a ghost revision. Mark it as not found and next!
        self.revid_to_mark[revid] = -1
        return []
    # Get the primary parent
    # TODO: Consider the excluded revisions when deciding the parents.
    # Currently, a commit with parents that are excluded ought to be
    # triggering the ref calculation below (and it is not).
    # IGC 20090824
    if revobj.parent_ids:
        parent = revobj.parent_ids[0]
    else:
        parent = breezy.revision.NULL_REVISION
    # Reserve the next mark for this commit
    mark = len(self.revid_to_mark) + 1
    self.revid_to_mark[revobj.revision_id] = mark
    return [parent, revobj.revision_id]
def emit_commit(self, revobj, ref, tree_old, tree_new):
    """Emit the commit command for one revision.

    The revision must already have a mark in self.revid_to_mark.

    :param revobj: the revision object to emit
    :param ref: the ref name for the commit (and reset) command
    :param tree_old: tree of the revision's first parent, or the tree of
        NULL_REVISION for a parentless revision
    :param tree_new: tree of the revision itself
    """
    # For parentless commits we need to issue reset command first, otherwise
    # git-fast-import will assume previous commit was this one's parent
    if tree_old.get_revision_id() == breezy.revision.NULL_REVISION:
        self.print_cmd(commands.ResetCommand(ref, None))
    file_cmds = self._get_filecommands(tree_old, tree_new)
    mark = self.revid_to_mark[revobj.revision_id]
    self.print_cmd(self._get_commit_command(ref, mark, revobj, file_cmds))
    # Report progress and checkpoint if it's time for that.
    # The commit's own mark is used as the running count: the size of
    # self.revid_to_mark does not change between commits whose marks were
    # all assigned up front, so it would trigger a report for every one
    # of them or for none.
    ncommits = mark
    self.report_progress(ncommits)
    if (self.checkpoint is not None and self.checkpoint > 0 and ncommits
            and ncommits % self.checkpoint == 0):
        self.note("Exported %i commits - adding checkpoint to output"
                  % ncommits)
        self._save_marks()
        self.print_cmd(commands.CheckpointCommand())
def _get_name_email(self, user):
    """Split a user string into UTF-8 encoded name and email parts.

    :param user: text such as 'Name <address>' or a bare name/address
    :return: a (name, email) tuple of bytes
    """
    if '<' in user:
        name, email = parseaddr(user)
    else:
        # If the email isn't inside <>, we need to use it as the name
        # in order for things to round-trip correctly.
        # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
        name, email = user, ''
    return name.encode("utf-8"), email.encode("utf-8")
def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
    """Build the CommitCommand for one revision.

    :param git_ref: the ref name the commit is made on
    :param mark: the mark assigned to this commit
    :param revobj: the revision object being exported
    :param file_cmds: iterable of file commands for the commit
    :return: a commands.CommitCommand
    """
    # Get the committer and author info
    committer = revobj.committer
    name, email = self._get_name_email(committer)
    committer_info = (name, email, revobj.timestamp, revobj.timezone)
    if self._multi_author_api_available:
        more_authors = revobj.get_apparent_authors()
        author = more_authors.pop(0)
    else:
        more_authors = []
        author = revobj.get_apparent_author()
    if not self.plain_format and more_authors:
        name, email = self._get_name_email(author)
        author_info = (name, email, revobj.timestamp, revobj.timezone)
        more_author_info = []
        for a in more_authors:
            name, email = self._get_name_email(a)
            more_author_info.append(
                (name, email, revobj.timestamp, revobj.timezone))
    elif author != committer:
        name, email = self._get_name_email(author)
        author_info = (name, email, revobj.timestamp, revobj.timezone)
        more_author_info = None
    else:
        author_info = None
        more_author_info = None

    # Get the parents in terms of marks
    non_ghost_parents = []
    for p in revobj.parent_ids:
        if p in self.excluded_revisions:
            continue
        parent_mark = self.revid_to_mark.get(p)
        # Ghosts either have no mark at all or carry the -1 placeholder
        # that preprocess_commit records for them; neither may be
        # referenced from the output.
        if parent_mark is None or parent_mark == -1:
            continue
        non_ghost_parents.append(b":%d" % parent_mark)
    if non_ghost_parents:
        from_ = non_ghost_parents[0]
        merges = non_ghost_parents[1:]
    else:
        from_ = None
        merges = None

    # Filter the revision properties. Some metadata (like the
    # author information) is already exposed in other ways so
    # don't repeat it here. A filtered copy is built so that the
    # revision object's own properties are left untouched.
    if self.plain_format:
        properties = None
    else:
        properties = {
            key: value for key, value in revobj.properties.items()
            if key not in self.properties_to_exclude}

    # Build and return the result
    return commands.CommitCommand(
        git_ref, mark, author_info, committer_info,
        revobj.message.encode("utf-8"), from_, merges, file_cmds,
        more_authors=more_author_info, properties=properties)
def _get_revision_trees(self, revids):
    """Yield the revision tree for each id in revids, in order.

    Trees already in self.tree_cache are reused. The tree for
    NULL_REVISION is obtained through revision_tree(), and all other
    missing trees through a single revision_trees() call. After the last
    tree has been yielded, the trees obtained here are added to the cache.

    :param revids: revision ids; iterated more than once, so this should
        be a container rather than a one-shot iterator
    """
    repository = self.branch.repository
    fetched = {}
    to_fetch = []
    for revid in revids:
        if revid == breezy.revision.NULL_REVISION:
            fetched[revid] = repository.revision_tree(revid)
        elif revid not in self.tree_cache:
            to_fetch.append(revid)
    for tree in repository.revision_trees(to_fetch):
        fetched[tree.get_revision_id()] = tree
    for revid in revids:
        try:
            yield self.tree_cache[revid]
        except KeyError:
            yield fetched[revid]
    # Remember the trees obtained here for later calls
    for revid, tree in fetched.items():
        self.tree_cache[revid] = tree
def _get_filecommands(self, tree_old, tree_new):
    """Get the list of FileCommands for the changes between two revisions.

    This is a generator. Commands for renames and deletes are yielded
    first, then symlink and directory modifications, and finally the
    file modifications together with their content.

    :param tree_old: the tree to compare against
    :param tree_new: the tree being exported
    """
    changes = tree_new.changes_from(tree_old)

    my_modified = list(changes.modified)

    # The potential interaction between renames and deletes is messy.
    # Handle it here ...
    file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
        changes.renamed, changes.removed, tree_new.get_revision_id(), tree_old)

    for cmd in file_cmds:
        yield cmd

    # Treat kind changes as modifications
    for change in changes.kind_changed:
        # The adjusted path itself is not used because no delete is
        # emitted (see below); the call is kept for the notes it logs.
        # It is given the change's old path; reading a not-yet-assigned
        # local here used to raise UnboundLocalError.
        self._adjust_path_for_renames(
            change.path[0], renamed, tree_new.get_revision_id())
        # IGC: I don't understand why a delete is needed here.
        # In fact, it seems harmful? If you uncomment this line,
        # please file a bug explaining why you needed to.
        # yield commands.FileDeleteCommand(path)
        my_modified.append(change)

    # Record modifications
    files_to_get = []
    for change in changes.added + changes.copied + my_modified + rd_modifies:
        if change.kind[1] == 'file':
            # File content is fetched in one go further down
            files_to_get.append(
                (change.path[1],
                 (change.path[1], helpers.kind_to_mode(
                     'file', change.executable[1]))))
        elif change.kind[1] == 'symlink':
            yield commands.FileModifyCommand(
                change.path[1].encode("utf-8"),
                helpers.kind_to_mode('symlink', False),
                None, tree_new.get_symlink_target(
                    change.path[1]).encode('utf-8'))
        elif change.kind[1] == 'directory':
            if not self.plain_format:
                yield commands.FileModifyCommand(
                    change.path[1].encode("utf-8"),
                    helpers.kind_to_mode('directory', False), None,
                    None)
        else:
            self.warning("cannot export '%s' of kind %s yet - ignoring" %
                         (change.path[1], change.kind[1]))

    # TODO(jelmer): Improve performance on remote repositories
    # by using Repository.iter_files_bytes for bzr repositories here.
    for (path, mode), chunks in tree_new.iter_files_bytes(files_to_get):
        yield commands.FileModifyCommand(
            path.encode("utf-8"), mode, None, b''.join(chunks))
def _process_renames_and_deletes(self, renames, deletes,
                                 revision_id, tree_old):
    """Turn renames and deletes into correctly ordered file commands.

    :param renames: the rename changes
    :param deletes: the removal changes
    :param revision_id: id of the revision being exported; only used in
        log messages
    :param tree_old: the tree the changes are relative to
    :return: a (file_cmds, modifies, renamed) tuple: the file commands to
        emit, the rename changes that also changed content or metadata,
        and the (old, new) path pairs of the renames that were kept
    """
    file_cmds = []
    modifies = []
    renamed = []

    # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
    # In a nutshell, there are several nasty cases:
    #
    # 1) bzr rm a; bzr mv b a; bzr commit
    # 2) bzr mv x/y z; bzr rm x; commit
    #
    # The first must come out with the delete first like this:
    #
    # D a
    # R b a
    #
    # The second case must come out with the rename first like this:
    #
    # R x/y z
    # D x
    #
    # So outputting all deletes first or all renames first won't work.
    # Instead, we need to make multiple passes over the various lists to
    # get the ordering right.

    must_be_renamed = {}
    old_to_new = {}
    deleted_paths = {change.path[0] for change in deletes}
    for change in renames:
        old_path = change.path[0]
        new_path = change.path[1]
        # Directory commands are left out in plain format
        emit = not (change.kind[1] == 'directory' and self.plain_format)
        if new_path in deleted_paths:
            # The rename target was deleted: the delete has to come first
            if emit:
                file_cmds.append(commands.FileDeleteCommand(
                    new_path.encode("utf-8")))
            deleted_paths.remove(new_path)
        if self.is_empty_dir(tree_old, old_path):
            self.note("Skipping empty dir %s in rev %s" % (old_path,
                                                           revision_id))
            continue
        renamed.append(change.path)
        old_to_new[old_path] = new_path
        if emit:
            file_cmds.append(commands.FileRenameCommand(
                old_path.encode("utf-8"), new_path.encode("utf-8")))
        if change.changed_content or change.meta_modified():
            modifies.append(change)

        # Renaming a directory implies all children must be renamed.
        # Note: changes_from() doesn't handle this
        if change.kind == ('directory', 'directory'):
            for p, e in tree_old.iter_entries_by_dir(
                    specific_files=[old_path]):
                if e.kind == 'directory' and self.plain_format:
                    continue
                old_child_path = osutils.pathjoin(old_path, p)
                new_child_path = osutils.pathjoin(new_path, p)
                must_be_renamed[old_child_path] = new_child_path

    # Add children not already renamed
    if must_be_renamed:
        pending = set(must_be_renamed) - set(old_to_new)
        for old_child_path in sorted(pending):
            new_child_path = must_be_renamed[old_child_path]
            if self.verbose:
                self.note("implicitly renaming %s => %s" % (old_child_path,
                                                            new_child_path))
            file_cmds.append(commands.FileRenameCommand(
                old_child_path.encode("utf-8"),
                new_child_path.encode("utf-8")))

    # Record remaining deletes
    for change in deletes:
        removed_path = change.path[0]
        if removed_path not in deleted_paths:
            # Already emitted ahead of a rename onto this path
            continue
        if change.kind[0] == 'directory' and self.plain_format:
            continue
        file_cmds.append(commands.FileDeleteCommand(
            removed_path.encode("utf-8")))
    return file_cmds, modifies, renamed
def _adjust_path_for_renames(self, path, renamed, revision_id):
    """Map path through the renames recorded so far.

    The renames are applied in order, so a later rename can act on the
    result of an earlier one. Every adjustment is noted.

    :param path: the path to adjust
    :param renamed: iterable of (old, new) path pairs
    :param revision_id: id of the revision being exported; only used in
        log messages
    :return: the adjusted path
    """
    # If a previous rename is found, we should adjust the path
    for old, new in renamed:
        if path == old:
            self.note("Changing path %s given rename to %s in revision %s"
                      % (path, new, revision_id))
            path = new
        elif path.startswith(old + '/'):
            self.note(
                "Adjusting path %s given rename of %s to %s in revision %s"
                % (path, old, new, revision_id))
            # Swap the leading directory only; str.replace would also
            # rewrite any later occurrence of old + "/" inside the path.
            path = new + path[len(old):]
    return path
def emit_tags(self):
    """Emit a reset command for every tag whose revision was exported.

    Tags that point at a revision without a usable mark are skipped with
    a warning. That covers revisions that were never marked as well as
    ghost revisions, which are recorded with the placeholder mark -1.

    In plain format, a tag whose ref name fails check_ref_format is
    either rewritten with sanitize_ref_name_for_git (when rewrite_tags is
    set) or skipped; a warning is issued in both cases.
    """
    for tag, revid in viewitems(self.branch.tags.get_tag_dict()):
        mark = self.revid_to_mark.get(revid)
        if mark is None or mark == -1:
            self.warning('not creating tag %r pointing to non-existent '
                         'revision %s' % (tag, revid))
            continue
        git_ref = b'refs/tags/%s' % tag.encode("utf-8")
        if self.plain_format and not check_ref_format(git_ref):
            if not self.rewrite_tags:
                self.warning('not creating tag %r as its name would not be '
                             'valid in git.', git_ref)
                continue
            new_ref = sanitize_ref_name_for_git(git_ref)
            self.warning('tag %r is exported as %r to be valid in git.',
                         git_ref, new_ref)
            git_ref = new_ref
        self.print_cmd(commands.ResetCommand(git_ref, b":%d" % mark))