Skip to content

Commit 6ecee5e

Browse files
committed
Set the default of explodeDocs to false in Reader2Doc (output is now one row per document unless setExplodeDocs(true) is called)
1 parent 650f992 commit 6ecee5e

File tree

2 files changed: 11 additions & 13 deletions

src/main/scala/com/johnsnowlabs/reader/Reader2Doc.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ class Reader2Doc(override val uid: String)
114114
}
115115

116116
setDefault(
117-
this.explodeDocs -> true,
117+
this.explodeDocs -> false,
118118
contentType -> "",
119119
flattenOutput -> false,
120120
titleThreshold -> 18)

src/test/scala/com/johnsnowlabs/reader/Reader2DocTest.scala

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
4242
val pipelineModel = pipeline.fit(emptyDataSet)
4343
val resultDf = pipelineModel.transform(emptyDataSet)
4444

45-
assert(resultDf.count() > 1)
45+
assert(resultDf.count() == 1)
4646
}
4747

4848
it should "output clean flatten text without any structured metadata" taggedAs FastTest in {
@@ -52,7 +52,6 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
5252
.setContentPath(s"$htmlFilesDirectory/example-div.html")
5353
.setOutputCol("document")
5454
.setFlattenOutput(true)
55-
.setExplodeDocs(false)
5655

5756
val pipeline = new Pipeline().setStages(Array(reader2Doc))
5857

@@ -91,20 +90,20 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
9190
}
9291
}
9392

94-
it should "convert Reader output to Document format with one row per document" taggedAs FastTest in {
93+
it should "convert Reader output to Document format with exploded documents" taggedAs FastTest in {
9594

9695
val reader2Doc = new Reader2Doc()
9796
.setContentType("text/html")
9897
.setContentPath(s"$htmlFilesDirectory/example-div.html")
9998
.setOutputCol("document")
100-
.setExplodeDocs(false)
99+
.setExplodeDocs(true)
101100

102101
val pipeline = new Pipeline().setStages(Array(reader2Doc))
103102

104103
val pipelineModel = pipeline.fit(emptyDataSet)
105104
val resultDf = pipelineModel.transform(emptyDataSet)
106105

107-
assert(resultDf.count() == 1)
106+
assert(resultDf.count() > 1)
108107
}
109108

110109
it should "work with Tokenizer" taggedAs FastTest in {
@@ -116,9 +115,8 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
116115

117116
val pipelineModel = pipeline.fit(emptyDataSet)
118117
val resultDf = pipelineModel.transform(emptyDataSet)
119-
resultDf.select("document").show(truncate = false)
120118

121-
assert(resultDf.count() > 1)
119+
assert(resultDf.count() == 1)
122120
}
123121

124122
it should "work for Text documents" taggedAs FastTest in {
@@ -132,7 +130,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
132130
val pipelineModel = pipeline.fit(emptyDataSet)
133131
val resultDf = pipelineModel.transform(emptyDataSet)
134132

135-
assert(resultDf.count() > 1)
133+
assert(resultDf.count() == 1)
136134
}
137135

138136
it should "work for Word documents" taggedAs FastTest in {
@@ -146,7 +144,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
146144
val pipelineModel = pipeline.fit(emptyDataSet)
147145
val resultDf = pipelineModel.transform(emptyDataSet)
148146

149-
assert(resultDf.count() > 1)
147+
assert(resultDf.count() == 1)
150148
}
151149

152150
it should "work with PDF documents" taggedAs FastTest in {
@@ -160,7 +158,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
160158
val pipelineModel = pipeline.fit(emptyDataSet)
161159
val resultDf = pipelineModel.transform(emptyDataSet)
162160

163-
assert(resultDf.count() > 1)
161+
assert(resultDf.count() == 1)
164162
}
165163

166164
it should "work with Markdown" taggedAs FastTest in {
@@ -174,7 +172,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
174172
val pipelineModel = pipeline.fit(emptyDataSet)
175173
val resultDf = pipelineModel.transform(emptyDataSet)
176174

177-
assert(resultDf.count() > 1)
175+
assert(resultDf.count() == 1)
178176
}
179177

180178
it should "work with XML" taggedAs FastTest in {
@@ -188,7 +186,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
188186
val pipelineModel = pipeline.fit(emptyDataSet)
189187
val resultDf = pipelineModel.transform(emptyDataSet)
190188

191-
assert(resultDf.count() > 1)
189+
assert(resultDf.count() == 1)
192190
}
193191

194192
it should "throw if contentPath is not set" taggedAs FastTest in {

0 commit comments

Comments
 (0)