1
- #!/usr/bin/env python3
2
-
3
- """
4
-
5
- odp2md 2021.5.0
6
-
7
- ODP2Pandoc is a tiny tool to convert
8
- OpenDocument formatted presentations (ODP)
9
- to Pandoc's Markdown.
10
-
11
- (c) Copyright 2019-2021 Hartmut Seichter
12
-
13
- This program is free software: you can redistribute it and/or modify
14
- it under the terms of the GNU General Public License as published by
15
- the Free Software Foundation, either version 3 of the License, or
16
- (at your option) any later version.
17
-
18
- This program is distributed in the hope that it will be useful,
19
- but WITHOUT ANY WARRANTY; without even the implied warranty of
20
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21
- GNU General Public License for more details.
22
-
23
- You should have received a copy of the GNU General Public License
24
- along with this program. If not, see <https://www.gnu.org/licenses/>.
25
-
26
- Usage:
27
-
28
- $> python odp2md --input <myslide.odp>
29
-
30
- """
31
-
32
- import os
33
- import zipfile
34
- import argparse
35
- import sys
36
- import re , unicodedata
37
- import textwrap
38
- from enum import Enum
39
- import xml .dom .minidom as dom
40
-
41
class Slide:
    """One presentation slide and its Markdown rendering.

    Attributes:
        title: Slide heading text.
        text: Body text (an indented Markdown bullet list built by the parser).
        notes: Speaker notes text.
        media: List of ``(archive_path, output_path)`` tuples; only the
            ``output_path`` is referenced from the generated Markdown.
    """

    # File suffixes that are rendered as an HTML <video> element.
    VIDEO_SUFFIXES = ('.mp4', '.mkv')

    def __init__(self):
        self.title = ''
        self.text = ""
        self.notes = ""
        self.media = []

    def generateMarkdown(self, blockToHTML=True):
        """Return the slide as a Markdown fragment.

        Args:
            blockToHTML: When true, video files are emitted as Pandoc raw
                HTML (```...`{=html}``) instead of a Markdown image link.

        Returns:
            A string starting with a level-2 heading for the title, followed
            by the dedented body text and one line per media entry.
        """
        # fix indentation: the parser indents every bullet by its DOM depth,
        # so strip the common leading whitespace
        self.text = textwrap.dedent(self.text)
        parts = ["## {0}\n\n{1}\n".format(self.title, self.text)]

        for _, target in self.media:
            # decide by file suffix only, so that a directory name containing
            # ".mp4" does not turn an image into a video
            isVideo = target.lower().endswith(self.VIDEO_SUFFIXES)

            if blockToHTML and isVideo:
                # since LaTeX extensions for video are deprecated, embed the
                # video as raw HTML that only HTML writers will pick up
                parts.append(
                    "`<video controls><source src=\"{0}\"></video>`{{=html}}\n".format(target)
                )
            else:
                parts.append("![]({0})\n".format(target))

        return "".join(parts)

    # override string representation
    def __str__(self):
        return self.generateMarkdown()
67
-
68
class Scope(Enum):
    """Kind of content the parser is currently collecting text for."""

    # no recognised presentation element entered yet
    NONE = 0
    # inside an element with presentation:class="title"
    TITLE = 1
    # inside an element with presentation:class="outline"
    OUTLINE = 2
    # speaker notes
    NOTES = 3
    # image captions / titles
    IMAGES = 4
75
-
76
-
77
class Parser:
    """Convert the ``content.xml`` of an ODP archive into ``Slide`` objects.

    The DOM is walked recursively; every ``draw:page`` element becomes one
    ``Slide`` appended to ``self.slides``.  Text is routed to the slide's
    title, body or notes depending on the most recently seen
    ``presentation:class`` attribute.
    """

    def __init__(self):
        self.slides = []
        self.currentSlide = None
        self.currentText = ""
        self.currentDepth = 0
        self.currentScope = Scope.NONE
        self.mediaDirectory = 'media'

    def getTextFromNode(self, node):
        """Return the data of a non-empty DOM text node, otherwise ``None``."""
        if node.nodeType == node.TEXT_NODE and len(str(node.data)) > 0:
            return node.data
        return None

    def hasAttributeWithValue(self, node, name, value):
        """Return True if ``node`` has attribute ``name`` equal to ``value``.

        Nodes without an attribute map (e.g. text nodes) yield False.
        """
        attributes = node.attributes
        if attributes is None:
            return False
        return any(
            attribute_name == name and attribute_value == value
            for attribute_name, attribute_value in attributes.items()
        )

    def debugNode(self, node):
        """Debugging hook; intentionally does nothing."""
        # print('node ', node.tagName)
        pass

    def handlePage(self, node):
        """Parse one ``draw:page`` element into a new slide and store it.

        Per-page state (depth, collected text, scope) is reset first so that
        nothing leaks over from the previously parsed page.
        """
        self.currentDepth = 0
        self.currentText = ""
        self.currentScope = Scope.NONE

        # set new current slide
        self.currentSlide = Slide()
        # getAttribute returns the plain string value ('' when absent)
        self.currentSlide.name = node.getAttribute('draw:name')
        # parse
        self.handleNode(node)
        self.currentSlide.text = self.currentText
        # store
        self.slides.append(self.currentSlide)

        self.currentText = ""

    def slugify(self, value, allow_unicode=False):
        """
        Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
        Remove characters that aren't alphanumerics, underscores, or hyphens.
        Convert to lowercase. Also strip leading and trailing whitespace.
        """
        value = str(value)
        if allow_unicode:
            value = unicodedata.normalize('NFKC', value)
        else:
            value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = re.sub(r'[^\w\s-]', '', value.lower()).strip()
        return re.sub(r'[-\s]+', '-', value)

    def _uniqueMediaPath(self, base, ext):
        """Return an output path for a media file that no slide uses yet.

        The name is ``<base>-<index><ext>`` inside ``self.mediaDirectory``,
        where ``index`` starts at the number of media already on the current
        slide and is increased until the path is unused.  This keeps slides
        that share a title from overwriting each other's files.
        """
        taken = {target for slide in self.slides for _, target in slide.media}
        taken.update(target for _, target in self.currentSlide.media)

        index = len(self.currentSlide.media)
        while True:
            candidate = os.path.join(
                self.mediaDirectory, "{0}-{1}{2}".format(base, index, ext))
            if candidate not in taken:
                return candidate
            index += 1

    def handleNode(self, node):
        """Recursively process ``node`` and all of its children.

        Updates the current scope from ``presentation:class``, registers
        media referenced by ``draw:image`` / ``draw:plugin`` elements on the
        current slide, and appends text-node content to the title, outline
        text or notes according to the current scope.
        """
        if self.hasAttributeWithValue(node, "presentation:class", "title"):
            self.currentScope = Scope.TITLE
        elif self.hasAttributeWithValue(node, "presentation:class", "outline"):
            self.currentScope = Scope.OUTLINE
        elif self.hasAttributeWithValue(node, "presentation:class", "notes"):
            self.currentScope = Scope.NOTES

        if node.nodeName in ('draw:image', 'draw:plugin') and node.hasAttribute('xlink:href'):
            href = node.getAttribute('xlink:href')
            # get the extension
            _, ext = os.path.splitext(href)
            # now we create a new slug name for conversion
            slug = self.slugify(self.currentSlide.title)
            if len(slug) < 1:
                slug = "slide-" + str(len(self.slides)) + "-image"
            # truncate the base name only, so the distinguishing index that
            # is appended afterwards can never be cut off
            slug = slug[:128]

            self.currentSlide.media.append(
                (href, self._uniqueMediaPath(slug, ext.lower())))

        t = self.getTextFromNode(node)

        if t is not None:
            if self.currentScope == Scope.OUTLINE:
                # indent by DOM depth; Slide.generateMarkdown dedents later
                self.currentText += (' ' * self.currentDepth) + '- ' + t + "\n"
            elif self.currentScope == Scope.TITLE:
                self.currentSlide.title += t
            elif self.currentScope == Scope.NOTES:
                self.currentSlide.notes += t + "\n"

        self.currentDepth += 1
        for child in node.childNodes:
            self.handleNode(child)
        self.currentDepth -= 1

    def handleDocument(self, dom):
        """Create one slide for every ``draw:page`` element in the document."""
        # we only need the pages
        for page in dom.getElementsByTagName("draw:page"):
            self.handlePage(page)

    def _extractMedia(self, odp):
        """Copy every registered media file from the archive to its target.

        Members missing from the archive are reported and skipped.
        """
        for slide in self.slides:
            for member, target in slide.media:
                try:
                    source = odp.open(member)
                except KeyError:
                    print("error finding media file ", member)
                    continue
                with source:
                    os.makedirs(os.path.dirname(target) or '.', exist_ok=True)
                    # stream in chunks so large videos are not held in memory
                    with open(target, 'wb') as sink:
                        for chunk in iter(lambda: source.read(65536), b''):
                            sink.write(chunk)

    def open(self, fname, mediaDir='media', markdown=False, mediaExtraction=False):
        """Parse an ODP file and optionally print Markdown / extract media.

        Args:
            fname: Path of the ``.odp`` (zip) file.
            mediaDir: Directory used for the output paths of linked media.
            markdown: When true, print every slide as Markdown to stdout.
            mediaExtraction: When true, write the referenced media files to
                their output paths.

        An archive without ``content.xml`` yields no slides.
        """
        self.mediaDirectory = mediaDir

        # open odp file
        with zipfile.ZipFile(fname) as odp:
            if 'content.xml' in odp.namelist():
                with odp.open('content.xml') as index:
                    doc = dom.parseString(index.read())
                self.handleDocument(doc)

            # output markdown
            if markdown:
                for slide in self.slides:
                    print(slide)

            # generate files
            if mediaExtraction:
                self._extractMedia(odp)
213
-
214
-
215
-
216
-
217
def main(argv=None):
    """Command-line entry point: parse arguments and run the converter.

    Args:
        argv: Optional list of argument strings; when ``None`` argparse
            reads ``sys.argv[1:]``.

    Exits via argparse (status 2) when the required ``--input`` is missing.
    """
    argument_parser = argparse.ArgumentParser(description='OpenDocument Presentation converter')

    argument_parser.add_argument("-i", "--input", required=True, help="ODP file to parse and extract")
    argument_parser.add_argument("-m", "--markdown", help="generate Markdown files", action='store_true')
    argument_parser.add_argument("-b", "--blocks", help="generate pandoc blocks for video files", action='store_true')
    argument_parser.add_argument("-x", "--extract", help="extract media files", action='store_true')
    argument_parser.add_argument("--mediadir", required=False, default='media', help="output directory for linked media")

    args = argument_parser.parse_args(argv)

    # NOTE(review): args.blocks is parsed but not forwarded anywhere, so the
    # -b/--blocks switch currently has no effect on the output.

    # --input is required, so argparse has already exited if it is missing
    juicer = Parser()
    juicer.open(args.input, args.mediadir, args.markdown, args.extract)


if __name__ == '__main__':
    main()
1
+ from .odp2md import main
2
+ main ()
0 commit comments