18
18
from sparknlp .internal import AnnotatorTransformer
19
19
from sparknlp .partition .partition_properties import *
20
20
21
+
21
22
class Reader2Doc (
22
23
AnnotatorTransformer ,
23
24
HasEmailReaderProperties ,
@@ -26,16 +27,16 @@ class Reader2Doc(
26
27
HasPowerPointProperties ,
27
28
HasTextReaderProperties ,
28
29
):
29
-
30
30
"""
31
- The Reader2Doc annotator allows you to use reading files more smoothly within existing
32
- Spark NLP workflows, enabling seamless reuse of your pipelines.
31
+ The Reader2Doc annotator lets you read files directly within existing
32
+ Spark NLP workflows, enabling seamless reuse of your pipelines.
33
33
34
- Reader2Doc can be used for extracting structured content from various document types
35
- using Spark NLP readers. It supports reading from many file types and returns parsed
36
- output as a structured Spark DataFrame.
34
+ Reader2Doc can be used for extracting structured content from various document types
35
+ using Spark NLP readers. It supports reading from many file types and returns parsed
36
+ output as a structured Spark DataFrame.
37
37
38
- Supported formats include:
38
+ Supported formats include:
39
+
39
40
- Plain text
40
41
- HTML
41
42
- Word (.doc/.docx)
@@ -44,79 +45,74 @@ class Reader2Doc(
44
45
- Email files (.eml, .msg)
45
46
- PDFs
46
47
47
- Example:
48
- from johnsnowlabs.reader import Reader2Doc
49
- from johnsnowlabs.nlp.base import DocumentAssembler
50
- from pyspark.ml import Pipeline
51
-
52
- # Initialize Reader2Doc for PDF files
53
- reader2doc = Reader2Doc() \
54
- .setContentType("application/pdf") \
55
- .setContentPath(f"{pdf_directory}/")
56
-
57
- # Build the pipeline with the Reader2Doc stage
58
- pipeline = Pipeline(stages=[reader2doc])
59
-
60
- # Fit the pipeline to an empty DataFrame
61
- pipeline_model = pipeline.fit(empty_data_set)
62
- result_df = pipeline_model.transform(empty_data_set)
63
-
64
- # Show the resulting DataFrame
65
- result_df.show()
66
-
67
- # Output Example:
68
- # +------------------------------------------------------------------------------------------------------------------------------------+
69
- # |document |
70
- # +------------------------------------------------------------------------------------------------------------------------------------+
71
- # |[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}] |
72
- # |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
73
- # |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
74
- # +------------------------------------------------------------------------------------------------------------------------------------+
48
+ Examples
49
+ --------
50
+ >>> from johnsnowlabs.reader import Reader2Doc
51
+ >>> from johnsnowlabs.nlp.base import DocumentAssembler
52
+ >>> from pyspark.ml import Pipeline
53
+ >>> # Initialize Reader2Doc for PDF files
54
+ >>> reader2doc = Reader2Doc() \\
55
+ ... .setContentType("application/pdf") \\
56
+ ... .setContentPath(f"{pdf_directory}/")
57
+ >>> # Build the pipeline with the Reader2Doc stage
58
+ >>> pipeline = Pipeline(stages=[reader2doc])
59
+ >>> # Fit the pipeline to an empty DataFrame
60
+ >>> pipeline_model = pipeline.fit(empty_data_set)
61
+ >>> result_df = pipeline_model.transform(empty_data_set)
62
+ >>> # Show the resulting DataFrame
63
+ >>> result_df.show()
64
+ +------------------------------------------------------------------------------------------------------------------------------------+
65
+ |document |
66
+ +------------------------------------------------------------------------------------------------------------------------------------+
67
+ |[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}] |
68
+ |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
69
+ |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
70
+ +------------------------------------------------------------------------------------------------------------------------------------+
75
71
"""
76
72
77
- name = ' Reader2Doc'
73
+ name = " Reader2Doc"
78
74
outputAnnotatorType = AnnotatorType .DOCUMENT
79
75
80
76
contentPath = Param (
81
77
Params ._dummy (),
82
78
"contentPath" ,
83
79
"contentPath path to files to read" ,
84
- typeConverter = TypeConverters .toString
80
+ typeConverter = TypeConverters .toString ,
85
81
)
86
82
87
83
outputCol = Param (
88
84
Params ._dummy (),
89
85
"outputCol" ,
90
86
"output column name" ,
91
- typeConverter = TypeConverters .toString
87
+ typeConverter = TypeConverters .toString ,
92
88
)
93
89
94
90
contentType = Param (
95
91
Params ._dummy (),
96
92
"contentType" ,
97
93
"Set the content type to load following MIME specification" ,
98
- typeConverter = TypeConverters .toString
94
+ typeConverter = TypeConverters .toString ,
99
95
)
100
96
101
97
explodeDocs = Param (
102
98
Params ._dummy (),
103
99
"explodeDocs" ,
104
100
"whether to explode the documents into separate rows" ,
105
- typeConverter = TypeConverters .toBoolean
101
+ typeConverter = TypeConverters .toBoolean ,
106
102
)
107
103
108
104
flattenOutput = Param (
109
105
Params ._dummy (),
110
106
"flattenOutput" ,
111
107
"If true, output is flattened to plain text with minimal metadata" ,
112
- typeConverter = TypeConverters .toBoolean
108
+ typeConverter = TypeConverters .toBoolean ,
113
109
)
114
110
115
111
titleThreshold = Param (
116
112
Params ._dummy (),
117
113
"titleThreshold" ,
118
114
"Minimum font size threshold for title detection in PDF docs" ,
119
- typeConverter = TypeConverters .toFloat
115
+ typeConverter = TypeConverters .toFloat ,
120
116
)
121
117
122
118
@keyword_only
@@ -189,4 +185,4 @@ def setTitleThreshold(self, value):
189
185
value : float
190
186
Minimum font size threshold for title detection in PDF docs
191
187
"""
192
- return self ._set (titleThreshold = value )
188
+ return self ._set (titleThreshold = value )
0 commit comments