'''
Created on 25 Mar 2010

Download the page images of a book through an HTTP proxy and/or bundle the
downloaded images into a single PDF file.

Typical use:
    -i ISBN -d -f IMAGE_FOLDER                  download the page images
    -i ISBN -c -f IMAGE_FOLDER -p PDF_FOLDER    convert the images to a pdf
    -i ISBN -d -c -f IMAGE_FOLDER -p PDF_FOLDER both
'''

import os
import sys
import time
from optparse import OptionParser

try:
    import httplib
except ImportError:  # Python 3 renamed the module
    import http.client as httplib

try:
    import thread
except ImportError:  # Python 3 renamed the module
    import _thread as thread

try:
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter, A4
    from reportlab.lib.units import cm, mm, inch, pica
except ImportError:
    # reportlab is only needed for the pdf conversion; downloading must keep
    # working without it. pdfDirectory() reports the missing package.
    canvas = None

write = sys.stdout.write
h = httplib.HTTPConnection('webproxy', 8080)
headers = {'Cookie': ''}
size = 1800
# NOTE(review): the template starts with a blank, which looks like a host
# name was stripped before publishing - confirm the real URL before use.
urltemplate = ' /%s/pages/%i.jpg?width=1800'
books = ('isbn2', 'isbn1')
maxPages = 10000


def downloadBookImages(folderLocation, isbn):
    '''Download every page image of a book into <folderLocation>/<isbn>.

    Pages are requested one by one (0, 1, 2, ...) and stored as "<page>.jpg".
    Pages that already exist on disk are skipped, so an interrupted download
    can be resumed. The loop stops on a 401 answer (expired cookie), on any
    other HTTP error status, or when the body is smaller than 1000 bytes,
    which is taken as the end of the book.

    folderLocation -- parent folder; the book folder is created below it
    isbn           -- book identifier, used in the URL and as folder name
    '''
    bookLocation = os.path.join(folderLocation, str(isbn))
    print('Downloading book to %s' % bookLocation)

    # makedirs also creates a missing parent folder.
    if not os.path.isdir(bookLocation):
        os.makedirs(bookLocation)

    connection = httplib.HTTPConnection('webproxy', 8080)
    try:
        for page in range(maxPages):
            imageFilePath = os.path.join(bookLocation, '%i.jpg' % page)

            # Already downloaded in an earlier run.
            if os.path.exists(imageFilePath):
                write('E%i ' % page)
                continue

            connection.request('GET', urltemplate % (isbn, page),
                               headers=headers)
            write('.')
            response = connection.getresponse()

            if response.status == 401:
                print('Not Authorized Book %s, the cookie has expired' % isbn)
                break

            data = response.read()

            # An error page must never be stored as a page image.
            if response.status >= 400:
                print('Stopping book %s at page %i, unexpected HTTP status %i'
                      % (isbn, page, response.status))
                break

            if len(data) < 1000:
                print('End of book %s since the page size is too small : %i'
                      % (isbn, len(data)))
                break

            with open(imageFilePath, 'wb') as imageFile:
                imageFile.write(data)
    finally:
        connection.close()


def _pageImagesInOrder(fileNames):
    '''Return the "<number>.jpg" names from fileNames in numeric page order.'''
    numbered = []
    for fileName in fileNames:
        stem, extension = os.path.splitext(fileName)
        if extension.lower() == '.jpg' and stem.isdigit():
            numbered.append((int(stem), fileName))
    numbered.sort()
    return [fileName for _, fileName in numbered]


def pdfDirectory(outputPDFName, imageDirectory):
    '''Create the pdf outputPDFName from the page images below imageDirectory.

    The directory tree is walked and every file named "<number>.jpg" is
    stretched over its own A4 page, in numeric order per directory.

    Raises ImportError when reportlab is not installed.
    '''
    if canvas is None:
        raise ImportError('reportlab is required to convert images to pdf')

    dirim = str(imageDirectory)
    output = str(outputPDFName)
    print('Converting to pdf %s, images %s' % (output, dirim))

    pageWidth, pageHeight = A4
    c = canvas.Canvas(output, pagesize=A4)
    for root, dirs, files in os.walk(dirim):
        dirs.sort()  # visit sub folders in a predictable order
        for name in _pageImagesInOrder(files):
            write('.')
            c.drawImage(os.path.join(root, name), mm * 0.001, mm * 0.001,
                        width=pageWidth, height=pageHeight,
                        preserveAspectRatio=False)
            # Finish the page; without this every image would be painted on
            # top of the previous one on a single page.
            c.showPage()
    c.save()
    print("PDF of Image directory created %s" % outputPDFName)


def threadExe(location, isbn):
    '''Print a progress line for every page number, pausing 10 s in between.'''
    print('Executing Thread for location %s book %s\n' % (location, isbn))
    for page in range(maxPages):
        print('%s Page - %i \n' % (isbn, page))
        time.sleep(10)


def createOptionParser():
    '''Build the command line parser (-i, -d, -c, -f and -p options).'''
    parser = OptionParser(usage="Usage: %prog [options]",
                          version="%prog 1.0")
    parser.add_option("-i", "--isbn", dest="isbn", action='store',
                      type='string')
    parser.add_option("-d", action="store_true", dest="download",
                      help="Download the book images", default=False)
    parser.add_option("-c", action="store_true", dest="converToPdf",
                      help="Convert images to pdf file", default=False)
    parser.add_option("-f", "--imageFolder", type="string",
                      help="Image folder to download or to read from.")
    parser.add_option("-p", "--pdfOutputFolder", type="string",
                      help="Pdf output folder.")
    return parser


def main(argv=None):
    '''Parse argv (default: sys.argv[1:]) and run the requested steps.

    All options are validated before any work starts, so a missing pdf
    folder is reported before a long download instead of after it.
    '''
    parser = createOptionParser()
    (options, args) = parser.parse_args(argv)

    if not options.isbn:
        parser.error("You have to specify the isbn book number")

    if not (options.download or options.converToPdf):
        print('Specify if you would like to download or convert the book or both?')
        return

    if not options.imageFolder:
        parser.error("Specify the image folder")
    if options.converToPdf and not options.pdfOutputFolder:
        parser.error("Specify the pdf folder")

    if options.download:
        downloadBookImages(options.imageFolder, options.isbn)

    if options.converToPdf:
        print('Converting to pdf %s, images %s'
              % (options.pdfOutputFolder, options.imageFolder))
        # The download step stores the pages in <imageFolder>/<isbn>; prefer
        # that folder so other books in imageFolder are not merged in.
        bookFolder = os.path.join(options.imageFolder, options.isbn)
        if not os.path.isdir(bookFolder):
            bookFolder = options.imageFolder
        pdfDirectory(os.path.join(options.pdfOutputFolder,
                                  options.isbn + ".pdf"),
                     bookFolder)


if __name__ == '__main__':
    main()

# Batch sketch kept from the original for reference (not executed):
#
# for book in books:
#     downloadBookImages('c:\\tmp',book)
#
# for book in books:
#     pdfDirectory('w:\\%s' % book , 'c:\\tmp\\%s.pdf' % book)
#
Friday, April 16, 2010
Python application for downloading a book and converting it to PDF
A similar application for downloading book images from the net and converting them to a PDF file, but this time written in Python.
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment