Skip to content

Commit ff12d64

Browse files
committed
fix for issue #1 and accommodates usage as command line interface or library
1 parent 53bf678 commit ff12d64

File tree

4 files changed

+248
-242
lines changed

4 files changed

+248
-242
lines changed

odp2md/__init__.py

Whitespace-only changes.

odp2md/__main__.py

Lines changed: 2 additions & 239 deletions
Original file line numberDiff line numberDiff line change
@@ -1,239 +1,2 @@
1-
#!/usr/bin/env python3
2-
3-
"""
4-
5-
odp2md 2021.5.0
6-
7-
ODP2Pandoc is a tiny tool to convert
8-
OpenDocument formatted presentations (ODP)
9-
to Pandoc's Markdown.
10-
11-
(c) Copyright 2019-2021 Hartmut Seichter
12-
13-
This program is free software: you can redistribute it and/or modify
14-
it under the terms of the GNU General Public License as published by
15-
the Free Software Foundation, either version 3 of the License, or
16-
(at your option) any later version.
17-
18-
This program is distributed in the hope that it will be useful,
19-
but WITHOUT ANY WARRANTY; without even the implied warranty of
20-
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21-
GNU General Public License for more details.
22-
23-
You should have received a copy of the GNU General Public License
24-
along with this program. If not, see <https://www.gnu.org/licenses/>.
25-
26-
Usage:
27-
28-
$> python odp2md --input <myslide.odp>
29-
30-
"""
31-
32-
import os
33-
import zipfile
34-
import argparse
35-
import sys
36-
import re, unicodedata
37-
import textwrap
38-
from enum import Enum
39-
import xml.dom.minidom as dom
40-
41-
class Slide:
42-
def __init__(self):
43-
self.title = ''
44-
self.text = ""
45-
self.notes = ""
46-
self.media = []
47-
48-
def generateMarkdown(self,blockToHTML=True):
49-
# fix indentation
50-
self.text = textwrap.dedent(self.text)
51-
out = "## {0}\n\n{1}\n".format(self.title,self.text)
52-
for m,v in self.media:
53-
54-
# maybe let everything else fail?
55-
isVideo = any(x in v for x in ['.mp4','.mkv'])
56-
57-
if blockToHTML and isVideo:
58-
# since LaTeX extensions for video are deprecated
59-
out += "`![]({0})`{{=html}}\n".format(v)
60-
else:
61-
out += "![]({0})\n".format(v)
62-
return out
63-
64-
# override string representation
65-
def __str__(self):
66-
return self.generateMarkdown()
67-
68-
class Scope(Enum):
69-
70-
NONE = 0
71-
TITLE = 1
72-
OUTLINE = 2
73-
NOTES = 3
74-
IMAGES = 4
75-
76-
77-
class Parser:
78-
79-
def __init__(self):
80-
self.slides = []
81-
self.currentSlide = None
82-
self.currentText = ""
83-
self.currentDepth = 0
84-
self.currentScope = Scope.NONE
85-
self.mediaDirectory = 'media'
86-
87-
def getTextFromNode(self,node):
88-
if node.nodeType == node.TEXT_NODE and len(str(node.data)) > 0:
89-
return node.data
90-
return None
91-
92-
def hasAttributeWithValue(self,node,name,value):
93-
if node.attributes == None:
94-
return False
95-
for attribute_name,attribute_value in node.attributes.items():
96-
if attribute_name == name and attribute_value == value:
97-
return True
98-
return False
99-
100-
def debugNode(self,node):
101-
# print('node ', node.tagName)
102-
pass
103-
104-
def handlePage(self,node):
105-
# set new current slide
106-
self.currentSlide = Slide()
107-
self.currentSlide.name = node.attributes['draw:name']
108-
# parse
109-
self.handleNode(node)
110-
# store
111-
self.slides.append(self.currentSlide)
112-
113-
114-
def slugify(self,value, allow_unicode=False):
115-
"""
116-
Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
117-
Remove characters that aren't alphanumerics, underscores, or hyphens.
118-
Convert to lowercase. Also strip leading and trailing whitespace.
119-
"""
120-
value = str(value)
121-
if allow_unicode:
122-
value = unicodedata.normalize('NFKC', value)
123-
else:
124-
value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
125-
value = re.sub(r'[^\w\s-]', '', value.lower()).strip()
126-
return re.sub(r'[-\s]+', '-', value)
127-
128-
def handleNode(self,node):
129-
130-
if self.hasAttributeWithValue(node,"presentation:class","title"):
131-
self.currentScope = Scope.TITLE
132-
elif self.hasAttributeWithValue(node,"presentation:class","outline"):
133-
self.currentScope = Scope.OUTLINE
134-
135-
if node.nodeName in ['draw:image', 'draw:plugin']:
136-
for k,v in node.attributes.items():
137-
if k == 'xlink:href':
138-
# get the extension
139-
name,ext = os.path.splitext(v)
140-
ext = ext.lower()
141-
# now we create a new slug name for conversion
142-
slug = self.slugify(self.currentSlide.title)
143-
if len(slug) < 1:
144-
slug = "slide-" + str(len(self.slides)) + "-image"
145-
slug += "-" + str(len(self.currentSlide.media))
146-
slug = (slug[:128]) if len(slug) > 128 else slug # truncate
147-
148-
self.currentSlide.media.append((v,os.path.join(self.mediaDirectory,slug+ext)))
149-
150-
151-
t = self.getTextFromNode(node)
152-
153-
if t != None:
154-
if self.currentScope == Scope.OUTLINE:
155-
self.currentText += (' ' * self.currentDepth) + '- ' + t + "\n"
156-
elif self.currentScope == Scope.TITLE:
157-
self.currentSlide.title += t
158-
elif self.currentScope == Scope.IMAGES:
159-
pass
160-
# print('image title ',t)
161-
162-
for c in node.childNodes:
163-
self.currentDepth += 1
164-
self.handleNode(c)
165-
self.currentDepth -= 1
166-
167-
168-
def handleDocument(self,dom):
169-
# we only need the pages
170-
pages = dom.getElementsByTagName("draw:page")
171-
# iterate pages
172-
for page in pages:
173-
174-
self.currentDepth = 0
175-
self.currentSlide = Slide()
176-
self.handleNode(page)
177-
self.currentSlide.text = self.currentText
178-
self.slides.append(self.currentSlide)
179-
180-
self.currentText = ""
181-
182-
183-
def open(self,fname,mediaDir='media',markdown = False,mediaExtraction = False):
184-
185-
self.mediaDirectory = mediaDir
186-
187-
# open odp file
188-
with zipfile.ZipFile(fname) as odp:
189-
info = odp.infolist()
190-
for i in info:
191-
if (i.filename == 'content.xml'):
192-
with odp.open('content.xml') as index:
193-
doc = dom.parseString(index.read())
194-
self.handleDocument(doc)
195-
196-
197-
# output markdown
198-
if markdown == True:
199-
for slide in self.slides:
200-
print(slide)
201-
202-
# generate files
203-
if mediaExtraction == True:
204-
for slide in self.slides:
205-
for m,v in slide.media:
206-
try:
207-
odp.extract(m,'.')
208-
if not os.path.exists(self.mediaDirectory):
209-
os.makedirs(self.mediaDirectory)
210-
os.rename(os.path.join('.',m),v)
211-
except KeyError:
212-
print("error finding media file ",m)
213-
214-
215-
216-
217-
def main():
218-
argument_parser = argparse.ArgumentParser(description='OpenDocument Presentation converter')
219-
220-
argument_parser.add_argument("-i","--input", required=True,help="ODP file to parse and extract")
221-
argument_parser.add_argument("-m","--markdown", help="generate Markdown files", action='store_true')
222-
argument_parser.add_argument("-b","--blocks", help="generate pandoc blocks for video files", action='store_true')
223-
argument_parser.add_argument("-x","--extract", help="extract media files", action='store_true')
224-
argument_parser.add_argument("--mediadir", required=False,default='media',help="output directory for linked media")
225-
226-
args = argument_parser.parse_args()
227-
228-
# print(args)
229-
# return
230-
231-
juicer = Parser()
232-
if 'input' in args:
233-
juicer.open(args.input,args.mediadir,args.markdown,args.extract)
234-
else:
235-
argument_parser.print_help()
236-
return
237-
238-
if __name__ == '__main__':
239-
main()
1+
from .odp2md import main
2+
main()

0 commit comments

Comments
 (0)