Skip to content

Commit 7e6e464

Browse files
committed
[SPARKNLP-1259] Adjust doc
1 parent a0374b7 commit 7e6e464

File tree

1 file changed

+39
-43
lines changed

1 file changed

+39
-43
lines changed

python/sparknlp/reader/reader2doc.py

Lines changed: 39 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from sparknlp.internal import AnnotatorTransformer
1919
from sparknlp.partition.partition_properties import *
2020

21+
2122
class Reader2Doc(
2223
AnnotatorTransformer,
2324
HasEmailReaderProperties,
@@ -26,16 +27,16 @@ class Reader2Doc(
2627
HasPowerPointProperties,
2728
HasTextReaderProperties,
2829
):
29-
3030
"""
31-
The Reader2Doc annotator allows you to use reading files more smoothly within existing
32-
Spark NLP workflows, enabling seamless reuse of your pipelines.
31+
The Reader2Doc annotator lets you read files directly within existing
32+
Spark NLP workflows, enabling seamless reuse of your pipelines.
3333
34-
Reader2Doc can be used for extracting structured content from various document types
35-
using Spark NLP readers. It supports reading from many file types and returns parsed
36-
output as a structured Spark DataFrame.
34+
Reader2Doc can be used for extracting structured content from various document types
35+
using Spark NLP readers. It supports reading from many file types and returns parsed
36+
output as a structured Spark DataFrame.
3737
38-
Supported formats include:
38+
Supported formats include:
39+
3940
- Plain text
4041
- HTML
4142
- Word (.doc/.docx)
@@ -44,79 +45,74 @@ class Reader2Doc(
4445
- Email files (.eml, .msg)
4546
- PDFs
4647
47-
Example:
48-
from johnsnowlabs.reader import Reader2Doc
49-
from johnsnowlabs.nlp.base import DocumentAssembler
50-
from pyspark.ml import Pipeline
51-
52-
# Initialize Reader2Doc for PDF files
53-
reader2doc = Reader2Doc() \
54-
.setContentType("application/pdf") \
55-
.setContentPath(f"{pdf_directory}/")
56-
57-
# Build the pipeline with the Reader2Doc stage
58-
pipeline = Pipeline(stages=[reader2doc])
59-
60-
# Fit the pipeline to an empty DataFrame
61-
pipeline_model = pipeline.fit(empty_data_set)
62-
result_df = pipeline_model.transform(empty_data_set)
63-
64-
# Show the resulting DataFrame
65-
result_df.show()
66-
67-
# Output Example:
68-
# +------------------------------------------------------------------------------------------------------------------------------------+
69-
# |document |
70-
# +------------------------------------------------------------------------------------------------------------------------------------+
71-
# |[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}] |
72-
# |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
73-
# |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
74-
# +------------------------------------------------------------------------------------------------------------------------------------+
48+
Examples
49+
--------
50+
>>> from sparknlp.reader.reader2doc import Reader2Doc
51+
>>> from sparknlp.base import DocumentAssembler
52+
>>> from pyspark.ml import Pipeline
53+
>>> # Initialize Reader2Doc for PDF files
54+
>>> reader2doc = Reader2Doc() \\
55+
... .setContentType("application/pdf") \\
56+
... .setContentPath(f"{pdf_directory}/")
57+
>>> # Build the pipeline with the Reader2Doc stage
58+
>>> pipeline = Pipeline(stages=[reader2doc])
59+
>>> # Fit the pipeline to an empty DataFrame
60+
>>> pipeline_model = pipeline.fit(empty_data_set)
61+
>>> result_df = pipeline_model.transform(empty_data_set)
62+
>>> # Show the resulting DataFrame
63+
>>> result_df.show()
64+
+------------------------------------------------------------------------------------------------------------------------------------+
65+
|document |
66+
+------------------------------------------------------------------------------------------------------------------------------------+
67+
|[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}] |
68+
|[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
69+
|[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
70+
+------------------------------------------------------------------------------------------------------------------------------------+
7571
"""
7672

77-
name = 'Reader2Doc'
73+
name = "Reader2Doc"
7874
outputAnnotatorType = AnnotatorType.DOCUMENT
7975

8076
contentPath = Param(
8177
Params._dummy(),
8278
"contentPath",
8379
"contentPath path to files to read",
84-
typeConverter=TypeConverters.toString
80+
typeConverter=TypeConverters.toString,
8581
)
8682

8783
outputCol = Param(
8884
Params._dummy(),
8985
"outputCol",
9086
"output column name",
91-
typeConverter=TypeConverters.toString
87+
typeConverter=TypeConverters.toString,
9288
)
9389

9490
contentType = Param(
9591
Params._dummy(),
9692
"contentType",
9793
"Set the content type to load following MIME specification",
98-
typeConverter=TypeConverters.toString
94+
typeConverter=TypeConverters.toString,
9995
)
10096

10197
explodeDocs = Param(
10298
Params._dummy(),
10399
"explodeDocs",
104100
"whether to explode the documents into separate rows",
105-
typeConverter=TypeConverters.toBoolean
101+
typeConverter=TypeConverters.toBoolean,
106102
)
107103

108104
flattenOutput = Param(
109105
Params._dummy(),
110106
"flattenOutput",
111107
"If true, output is flattened to plain text with minimal metadata",
112-
typeConverter=TypeConverters.toBoolean
108+
typeConverter=TypeConverters.toBoolean,
113109
)
114110

115111
titleThreshold = Param(
116112
Params._dummy(),
117113
"titleThreshold",
118114
"Minimum font size threshold for title detection in PDF docs",
119-
typeConverter=TypeConverters.toFloat
115+
typeConverter=TypeConverters.toFloat,
120116
)
121117

122118
@keyword_only
@@ -189,4 +185,4 @@ def setTitleThreshold(self, value):
189185
value : float
190186
Minimum font size threshold for title detection in PDF docs
191187
"""
192-
return self._set(titleThreshold=value)
188+
return self._set(titleThreshold=value)

0 commit comments

Comments
 (0)