ASPN ActiveState Programmer Network  
sign in | join ActiveState, a division of Sophos
/ Home / Perl / PHP / Python / Tcl / XSLT /
/ Safari / My ASPN /
Cookbooks | Documentation | Mailing Lists | Modules | News Feeds | Products | User Groups | Web Services
Submit Recipe
My Recipes

All Recipes
All Cookbooks


View by Category

Title: Simple File Splitter/Combiner module
Submitter: Anand Pillai (other recipes)
Last Updated: 2003/11/06
Version no: 1.3
Category: Files

 

4 stars 1 vote(s)


Description:

This module can be used to split any file, text or binary,
into equal-sized chunks. It can also combine the chunks
again to recreate the original file.

Source: Text Source

""" FileSplitter - Simple Python file split/concat module.

    What it does
    -==========-
    
    1. Split a text/binary file into equal sized chunks
       and save them separately. 

    2. Concat existing chunks and recreate
       original file.

    Author: Anand Pillai
    Copyright : None, (Public Domain)
"""

import os, sys

class FileSplitterException(Exception):
    """Error raised by FileSplitter when a file cannot be read or written.

    The object passed to the constructor is kept unchanged in ``value``;
    converting the exception to a string yields ``str(value)``.
    """

    def __str__(self):
        # The message is simply the textual form of the stored payload.
        payload = self.value
        return str(payload)

    def __init__(self, value):
        # Keep the caller-supplied payload for __str__ and for handlers.
        self.value = value

def usage():
    """Return the command-line help text for FileSplitter.py.

    The -n option is the number of chunks to produce (it is stored as the
    chunk count by FileSplitter.parseOptions), so the synopsis names it
    <numchunks> rather than a chunk size.
    """
    return """\nUsage: FileSplitter.py -i <inputfile> -n <numchunks> [option]\n
    Options:\n
    -s, --split  Split file into chunks
    -j, --join   Join chunks back to file.
    """

class FileSplitter:
    """Split a file into numbered chunk files and join them back again.

    Chunks are written to the current working directory and are named
    ``<basename>-<index><postfix>`` with ``index`` counting from 1.
    Combining looks for such files in the current working directory and
    writes the rebuilt file there as ``<basename>``.

    The code runs unchanged on Python 2.6+ and Python 3.
    """

    # Upper bound on the number of bytes held in memory while copying.
    _BUFSIZE = 1024 * 1024

    def __init__(self):

        # cache filename
        self.__filename = ''
        # number of equal sized chunks
        self.__numchunks = 5
        # Size of each chunk
        self.__chunksize = 0
        # Optional postfix string for the chunk filename
        self.__postfix = ''
        # Program name
        self.__progname = "FileSplitter.py"
        # Action
        self.__action = 0 # split

    def parseOptions(self, args):
        """Configure the splitter from a command-line argument list.

        args -- option strings without the program name, e.g. sys.argv[1:].

        Recognised options: -i <file>, -n <number of chunks>, -s/--split
        and -j/--join.  Exits the program (SystemExit) with an error
        message when the options are malformed, the chunk count is not a
        positive integer, or no input file was given.
        """

        import getopt

        try:
            # --split/--join are plain flags, so no trailing '=' here
            # (a trailing '=' would make getopt demand an argument).
            optlist, _unused = getopt.getopt(args, 'sji:n:', ["split", "join"])
        except getopt.GetoptError as e:
            # Carrying on with half-parsed options could only fail later.
            sys.exit("Error: %s" % e)

        for option, value in optlist:
            if option == '-i':
                self.__filename = value
            elif option == '-n':
                try:
                    self.__numchunks = int(value)
                except ValueError:
                    sys.exit("Error: invalid number of chunks %r" % value)
                if self.__numchunks < 1:
                    sys.exit("Error: number of chunks must be at least 1")
            elif option in ('-s', '--split'):
                self.__action = 0 # split
            elif option in ('-j', '--join'):
                self.__action = 1 # combine

        if not self.__filename:
            sys.exit("Error: filename not given")

    def do_work(self):
        """Run the configured action: split (0) or combine (1)."""
        if self.__action == 0:
            self.split()
        elif self.__action == 1:
            self.combine()
        else:
            return None

    def _copy_bytes(self, src, dst, count):
        """Copy up to count bytes from src to dst in bounded pieces.

        Returns the number of bytes actually copied, which is smaller
        than count only when src reaches end of file first.
        """
        copied = 0
        while copied < count:
            data = src.read(min(self._BUFSIZE, count - copied))
            if not data:
                break
            dst.write(data)
            copied += len(data)
        return copied

    def split(self):
        """Split the file and save the chunks to separate files.

        Every chunk holds filesize // numchunks bytes; the last chunk
        additionally receives the remainder.  Raises FileSplitterException
        when the chunk count is not positive, the input cannot be opened
        or a chunk cannot be written (skipping a failed chunk would leave
        a set of chunks that cannot be rejoined correctly).
        """

        print('Splitting file %s' % self.__filename)
        print('Number of chunks %s \n' % self.__numchunks)

        if self.__numchunks < 1:
            raise FileSplitterException('number of chunks must be at least 1')

        try:
            f = open(self.__filename, 'rb')
        except (OSError, IOError) as e:
            raise FileSplitterException(str(e))

        try:
            bname = os.path.basename(self.__filename)
            try:
                # Get the file size
                fsize = os.path.getsize(self.__filename)
            except (OSError, IOError) as e:
                raise FileSplitterException(str(e))
            # Integer division keeps the chunk size exact for any file size.
            self.__chunksize = fsize // self.__numchunks

            chunksz = self.__chunksize
            total_bytes = 0

            for x in range(self.__numchunks):
                chunkfilename = bname + '-' + str(x + 1) + self.__postfix

                # The last chunk takes whatever has not been written yet.
                if x == self.__numchunks - 1:
                    chunksz = fsize - total_bytes

                print('Writing file %s' % chunkfilename)
                try:
                    with open(chunkfilename, 'wb') as chunkf:
                        total_bytes += self._copy_bytes(f, chunkf, chunksz)
                except (OSError, IOError) as e:
                    raise FileSplitterException(str(e))
        finally:
            f.close()

        print('Done.')

    def _chunk_index(self, name):
        """Return the chunk number encoded in name, or None if absent."""
        dash = name.rfind('-')
        if dash == -1:
            return None
        digits = name[dash + 1:]
        if self.__postfix and digits.endswith(self.__postfix):
            digits = digits[:len(digits) - len(self.__postfix)]
        try:
            return int(digits)
        except ValueError:
            return None

    def sort_index(self, f1, f2):
        """cmp-style comparison of two chunk filenames by chunk number.

        Returns a negative, zero or positive integer when f1's chunk
        number is lower than, equal to or higher than f2's.  Names that
        carry no chunk number compare as equal (0).
        """
        i1 = self._chunk_index(f1)
        i2 = self._chunk_index(f2)
        if i1 is None or i2 is None:
            return 0
        return i1 - i2

    def combine(self):
        """Combine existing chunks to recreate the file.

        The chunks must be present in the cwd.  The new file is written
        to the cwd.  Chunks are appended in numeric order and streamed
        into the output so that only a small buffer is held in memory.
        Raises FileSplitterException when no chunk is found (so that an
        existing file is not replaced by an empty one) or when reading a
        chunk or writing the output fails.
        """

        import re
        import shutil

        print('Creating file %s' % self.__filename)

        bname = os.path.basename(self.__filename)

        # re.escape copes with every special character in the name, and
        # anchoring at the end rejects look-alikes such as "name-1.bak".
        chunkre = re.compile(re.escape(bname) + '-([0-9]+)'
                             + re.escape(self.__postfix) + r'\Z')

        chunks = []
        for entry in os.listdir("."):
            found = chunkre.match(entry)
            if found:
                chunks.append((int(found.group(1)), entry))

        print('Number of chunks %d \n' % len(chunks))
        if not chunks:
            raise FileSplitterException('no chunks found for ' + bname)

        # Numeric order, so that chunk 10 follows chunk 9 rather than chunk 1.
        chunks.sort()

        try:
            with open(bname, 'wb') as out:
                for _index, name in chunks:
                    print('Appending chunk %s' % os.path.join(".", name))
                    with open(name, 'rb') as chunk:
                        shutil.copyfileobj(chunk, out)
        except (OSError, IOError, EOFError) as e:
            raise FileSplitterException(str(e))

        print('Wrote file %s' % bname)

def main():
    """Command-line entry point.

    Prints the usage text and exits when no arguments are given.
    Otherwise parses the options and runs the requested action; a
    FileSplitterException is reported as a one-line error on exit
    instead of an uncaught traceback.
    """
    if len(sys.argv) < 2:
        sys.exit(usage())

    fsp = FileSplitter()
    fsp.parseOptions(sys.argv[1:])
    try:
        fsp.do_work()
    except FileSplitterException as e:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("Error: %s" % e)

if __name__=="__main__":
    main()

Discussion:

Often we need to split big files into many chunks
either for saving them to disks, uploading to a web-site
or for some other reason. I used to rely on 3rd party
programs before for this task, but never could find a program
handy when needed.

As usual python excels in such 'scripting' tasks and this
script makes the job a breeze. :-)



Add comment

Number of comments: 5

Refactored..., Anand Pillai, 2003/10/17


# Modified on 18th Oct
I have modified this recipe to print a Usage string,
and it now takes command line options. While combining
it does not need the number of chunks and works with
all the chunks in the current directory.

-Anand

Add comment

Files larger than physical memory on machine, Anthony DiGregorio, 2004/03/24
I made a slight modification to the combine function: instead of holding one large "data" object in memory, it appends the data from each chunk to the output file as it is read.
Add comment

File size larger than Physical Memory available, Anthony DiGregorio, 2004/03/24
Here is the code:

        # Variant of the tail of combine(): each chunk is written to the
        # output file as soon as it is read, so at most one chunk is held
        # in memory at a time.
        try:
            # NOTE(review): 'ab' appends -- if a file named bname already
            # exists, the chunks are added after its current content rather
            # than replacing it; confirm whether 'wb' was intended.
            cmbf = open(bname, 'ab')
            for f in chunkfiles:

                try:
                    print 'Appending chunk', os.path.join(".", f)
                    #data += open(f, 'rb').read()
                    data = open(f, 'rb').read()
                    cmbf.write(data)
                except (OSError, IOError, EOFError), e:
                    # A chunk that fails is reported and skipped; the
                    # remaining chunks are still appended.
                    print e
                    continue
            cmbf.close()

        except (OSError, IOError, EOFError), e:
            raise FileSplitterException, str(e)            

        #try:
        #    f = open(bname, 'wb')
        #    f.write(data)
        #    f.close()
        #except (OSError, IOError, EOFError), e:
        #    raise FileSplitterException, str(e)

        print 'Wrote file', bname

Add comment

constant filename length, jc Not specified, 2005/02/05
Here is the code to number each chunk file with 3 digits (in split mode, near line #107 in my copy):

chunkfilename = bname + ('-%03d' % (x+1)) + self.__postfix

Add comment

Output to the same directory, jc Not specified, 2005/02/05
Here is the code to generate each chunk file in the same directory as the source file (near line #94 in my copy):

        #bname = (os.path.split(self.__filename))[1]
        ## output to the same directory
        bname = self.__filename

Add comment

SEARCH
advanced | search help



Highest rated recipes:

1. Network Ping Pong using ...

2. SOLVING THE METACLASS ...

3. The Secret Name of List ...

4. Metaclass for Interface ...

5. Povray for python

6. Calling Windows API ...

7. Watching a directory ...

8. Generic filter logic ...

9. Implementation of sets ...

10. HTML colors to/from RGB ...



Privacy Policy | Email Opt-out | Feedback | Syndication
© 2005 ActiveState, a division of Sophos All rights reserved