python转载[Rsync Algorithm]

 

转自:http://code.activestate.com/recipes/577518-rsync-algorithm/?in=lang-python

 

代码:Tested in Python 2.5, 2.6, and 3.1. In 2.7, io.BufferedReader should yield the best throughput. On all other versions use __builtin__.open.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This is a pure Python implementation of the [rsync algorithm](TM96).

[TM96] Andrew Tridgell and Paul Mackerras. The rsync algorithm.
Technical Report TR-CS-96-05, Canberra 0200 ACT, Australia, 1996.
http://samba.anu.edu.au/rsync/.

### Example Use Case: ###

    # On the system containing the file that needs to be patched
    >>> unpatched = open("unpatched.file", "rb")
    >>> hashes = blockchecksums(unpatched)

    # On the remote system after having received `hashes`
    >>> patchedfile = open("patched.file", "rb")
    >>> delta = rsyncdelta(patchedfile, hashes)

    # System with the unpatched file after receiving `delta`
    >>> unpatched.seek(0)
    >>> save_to = open("locally-patched.file", "wb")
    >>> patchstream(unpatched, save_to, delta)
"""

import collections
import hashlib

# Python 2.x compatibility.
#
# On Python 2.6/2.7 ``bytes`` is merely an alias of ``str``; on Python 2.5 the
# name does not exist at all. Probing ``__builtins__`` with hasattr() is not
# reliable because it is a plain dict (not a module) inside imported modules,
# which made the shim below replace the real ``bytes`` type on Python 3.
try:
    _BYTES_IS_STR = str is bytes
except NameError:
    _BYTES_IS_STR = True

if _BYTES_IS_STR:
    def bytes(var, *args):
        """
        Emulate the Python 3 ``bytes`` conversions used by this module.

        An iterable of integers is joined into a byte string; anything
        that chr() rejects (e.g. a byte string) is instead converted into
        a sequence of integer byte values. Extra arguments are ignored.
        """
        try:
            return ''.join(map(chr, var))
        except TypeError:
            return map(ord, var)

# Public API of the module. Each name needs its own comma-separated entry;
# adjacent string literals without commas are silently concatenated.
__all__ = [
    "rollingchecksum",
    "weakchecksum",
    "patchstream",
    "rsyncdelta",
    "blockchecksums",
]


def rsyncdelta(datastream, remotesignatures, blocksize=4096):
    """
    Generate a binary patch when supplied with the weak and strong
    hashes from an unpatched target and a readable stream for the
    up-to-date data.

    Arguments:
    datastream -- readable binary stream holding the up-to-date data; the
        code uses its ``read``, ``tell`` and ``closed`` members.
    remotesignatures -- pair ``(weak, strong)`` of sequences supporting
        ``index(value, start)``, as returned by blockchecksums().
    blocksize -- block length in bytes. It must be the same as the value
        used to generate remotesignatures.

    Returns a list whose first element is blocksize. Every following
    element is either an int (index of a block of the unpatched target to
    copy) or a bytes object (literal data missing from the target).
    """
    remote_weak, remote_strong = remotesignatures

    match = True
    matchblock = -1
    deltaqueue = collections.deque()

    while True:
        if match and datastream is not None:
            # Whenever there is a match or the loop is running for the first
            # time, populate the window using weakchecksum instead of rolling
            # through every single byte which takes at least twice as long.
            window = collections.deque(bytes(datastream.read(blocksize)))
            checksum, a, b = weakchecksum(window)

        try:
            # If there are two identical weak checksums in a file, and the
            # matching strong hash does not occur at the first match, it will
            # be missed and the data sent over. May fix eventually, but this
            # problem arises very rarely.
            matchblock = remote_weak.index(checksum, matchblock + 1)
            stronghash = hashlib.md5(bytes(window)).hexdigest()
            matchblock = remote_strong.index(stronghash, matchblock)

            match = True
            deltaqueue.append(matchblock)

            # datastream is set to None once the input is exhausted. A match
            # at that point accounts for everything left in the window, so
            # stop here rather than dereferencing None.
            if datastream is None or datastream.closed:
                break
            continue

        except ValueError:
            # The weakchecksum did not match
            match = False
            try:
                if datastream:
                    # Get the next byte and affix to the window
                    newbyte = ord(datastream.read(1))
                    window.append(newbyte)
            except TypeError:
                # No more data from the file (ord() rejects the empty read);
                # the window will slowly shrink. newbyte needs to be zero
                # from here on to keep the checksum correct.
                newbyte = 0
                tailsize = datastream.tell() % blocksize
                datastream = None

            if datastream is None and len(window) <= tailsize:
                # The likelihood that any blocks will match after this is
                # nearly nil so call it quits.
                deltaqueue.append(window)
                break

            # Yank off the extra byte and calculate the new window checksum
            oldbyte = window.popleft()
            checksum, a, b = rollingchecksum(oldbyte, newbyte, a, b, blocksize)

            # Add the old byte to the file delta. This is data that was not
            # found inside of a matching block so it needs to be sent to the
            # target. The last queue entry is extended when it is a literal
            # run; an int entry (AttributeError) or an empty queue
            # (IndexError) starts a new run.
            try:
                deltaqueue[-1].append(oldbyte)
            except (AttributeError, IndexError):
                deltaqueue.append([oldbyte])

    # Return a delta that starts with the blocksize and converts all iterables
    # to bytes. Empty literal runs are dropped.
    deltastructure = [blocksize]
    for element in deltaqueue:
        if isinstance(element, int):
            deltastructure.append(element)
        elif element:
            deltastructure.append(bytes(element))

    return deltastructure


def blockchecksums(instream, blocksize=4096):
    """
    Return the weak and strong hashes for each block of the defined
    size for the given data stream.

    Arguments:
    instream -- readable binary stream; it is read from its current
        position until ``read`` returns an empty result.
    blocksize -- number of bytes requested per block. The final block may
        be shorter.

    Returns a ``(weakhashes, stronghashes)`` tuple of two parallel lists:
    the first holds the weakchecksum() value of every block, the second
    the hexadecimal MD5 digest of the same block.
    """
    weakhashes = []
    stronghashes = []

    read = instream.read(blocksize)
    while read:
        # bytes() turns the chunk into integer byte values for weakchecksum.
        weakhashes.append(weakchecksum(bytes(read))[0])
        stronghashes.append(hashlib.md5(read).hexdigest())
        read = instream.read(blocksize)

    return weakhashes, stronghashes


def patchstream(instream, outstream, delta):
    """
    Patch instream using the supplied delta and write the resultant
    data to outstream.

    Arguments:
    instream -- seekable, readable binary stream with the unpatched data.
    outstream -- writable binary stream receiving the patched data.
    delta -- list as produced by rsyncdelta(): the block size followed by
        int block indices and literal byte strings.
    """
    blocksize = delta[0]

    for element in delta[1:]:
        if isinstance(element, int) and blocksize:
            # An int refers to a block of the unpatched stream: copy it.
            instream.seek(element * blocksize)
            element = instream.read(blocksize)
        # Anything else is literal data and is written unchanged.
        outstream.write(element)


def rollingchecksum(removed, new, a, b, blocksize=4096):
    """
    Generate a new weak checksum when supplied with the internal state
    of the checksum calculation for the previous window, the removed
    byte, and the added byte.

    Arguments:
    removed -- integer value of the byte leaving the front of the window.
    new -- integer value of the byte entering the back of the window.
    a, b -- the two running sums returned for the previous window.
    blocksize -- window length the sums were computed for.

    Returns a ``(checksum, a, b)`` tuple with the updated state.
    """
    # ``a`` is the plain byte sum: swap the removed byte for the new one.
    a -= removed - new
    # ``b`` weights the front byte by blocksize: drop that contribution and
    # add the updated ``a``.
    b -= removed * blocksize - a
    return (b << 16) | a, a, b


def weakchecksum(data):
    """
    Generate a weak checksum from an iterable set of bytes.

    Arguments:
    data -- sized iterable of integer byte values (for example a bytes
        object on Python 3, a list, or a deque).

    Returns a ``(checksum, a, b)`` tuple, where ``a`` is the sum of all
    values, ``b`` is the sum of the values weighted by their distance from
    the end of the data, and ``checksum`` is ``(b << 16) | a``.
    """
    a = b = 0
    length = len(data)

    # Iterate instead of indexing: positional access into a deque is not
    # constant time, and rsyncdelta() passes its window as a deque.
    for position, value in enumerate(data):
        a += value
        b += (length - position) * value

    return (b << 16) | a, a, b

 

 

测试:

# On the system containing the file that needs to be patched 
>>> unpatched = open("unpatched.file", "rb") 
>>> hashes = blockchecksums(unpatched) 
 
# On the remote system after having received `hashes` 
>>> patchedfile = open("patched.file", "rb") 
>>> delta = rsyncdelta(patchedfile, hashes) 
 
# System with the unpatched file after receiving `delta` 
>>> unpatched.seek(0) 
>>> save_to = open("locally-patched.file", "wb") 
>>> patchstream(unpatched, save_to, delta) 

 

rsync算法:http://www.cnblogs.com/itech/archive/2010/06/13/1757952.html

 

完!


 

posted @ 2011-01-20 18:43  iTech  阅读(1835)  评论(0)  编辑  收藏  举报