Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ Contributors
* Lars Hupfeldt Nielsen - OpenSSL FIPS mode md5 bug fix
* Louis Maddox -- better docstrings
* Łukasz Langa -- partial support for autodoc
* Lukas Wieg -- JavaScript search improvement
* Marco Buttu -- doctest extension (pyversion option)
* Mark Ostroth -- semantic HTML contributions
* Martin Hans -- autodoc improvements
Expand Down
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ Bugs fixed
for objects documented as ``:py:data:`` to be hyperlinked in function signatures.
* #13858: doctest: doctest blocks are now correctly added to a group defined by the
configuration variable ``doctest_test_doctest_blocks``.
* #13892: search: support word exclusion in the search by prefixing words with "-".


Testing
Expand Down
46 changes: 31 additions & 15 deletions sphinx/themes/basic/static/searchtools.js
Original file line number Diff line number Diff line change
Expand Up @@ -171,15 +171,28 @@ const _orderResultsByScoreThenName = (a, b) => {
* Default splitQuery function. Can be overridden in ``sphinx.search`` with a
* custom function per language.
*
* The regular expression works by splitting the string on consecutive characters
* that are not Unicode letters, numbers, underscores, or emoji characters.
* This is the same as ``\W+`` in Python, preserving the surrogate pair area.
* The `consecutiveLetters` regular expression works by matching consecutive characters
* that are Unicode letters, numbers, underscores, or emoji characters.
*
* The `searchWords` regular expression works by matching a word like structure
* that matches the `consecutiveLetters` with or without a leading hyphen '-' which is
* used to exclude search terms later on.
*/
if (typeof splitQuery === "undefined") {
var splitQuery = (query) =>
query
.split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu)
.filter((term) => term); // remove remaining empty strings
var splitQuery = (query) => {
const consecutiveLetters =
/[\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu;
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IDK if it's bad to define is regex first and reference it later in the new regex. It could also be plain text. But I thought this way, the origin of the regex can be easier understood.

const searchWords = new RegExp(
`(${consecutiveLetters.source})|\\s(-${consecutiveLetters.source})`,
"gu",
);
return Array.from(
query
.matchAll(searchWords)
.map((results) => results[1] ?? results[2]) // select one of the possible groups (e.g. "word" or "-word").
.filter((term) => term), // remove remaining empty strings.
);
};
}

/**
Expand Down Expand Up @@ -627,15 +640,18 @@ const Search = {

// ensure that none of the excluded terms is in the search result
if (
[...excludedTerms].some(
(term) =>
terms[term] === file
|| titleTerms[term] === file
|| (terms[term] || []).includes(file)
|| (titleTerms[term] || []).includes(file),
)
[...excludedTerms].some((excludedTerm) => {
// Both mappings will contain either a single integer or a list of integers.
// Converting them to lists makes the comparison more readable.
let excludedTermFiles = [].concat(terms[excludedTerm]);
let excludedTitleFiles = [].concat(titleTerms[excludedTerm]);
return (
excludedTermFiles.includes(file)
|| excludedTitleFiles.includes(file)
);
})
)
break;
continue;
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably the most important change!


// select one (max) score for the file.
const score = Math.max(...wordList.map((w) => scoreMap.get(file).get(w)));
Expand Down
36 changes: 36 additions & 0 deletions tests/js/searchtools.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,32 @@ describe("Basic html theme search", function () {
expect(Search.performTermsSearch(searchterms, excluded)).toEqual(hits);
});

it("should find results when excluded terms are used", function () {
// This fixture already existed and has the right data to make this test work.
// Replace with another matching fixture if necessary.
eval(loadFixture("titles/searchindex.js"));

// It's important that the searchterm is included in multiple pages while the
// excluded term is not included in all pages.
// In this case the ``for`` is included in the two existing pages while the ``ask``
// is only included in one page.
[_searchQuery, searchterms, excluded, ..._remainingItems] =
Search._parseQuery("for -ask");

// prettier-ignore
hits = [[
'relevance',
'Relevance',
'',
null,
2,
'relevance.rst',
'text'
]];
expect(excluded).toEqual(new Set(["ask"]));
expect(Search.performTermsSearch(searchterms, excluded)).toEqual(hits);
});

it('should partially-match "sphinx" when in title index', function () {
eval(loadFixture("partial/searchindex.js"));

Expand Down Expand Up @@ -295,6 +321,16 @@ describe("splitQuery regression tests", () => {
expect(parts).toEqual(["Pin", "Code"]);
});

it("can keep underscores in words", () => {
const parts = splitQuery("python_function");
expect(parts).toEqual(["python_function"]);
});

it("can maintain negated search words", () => {
const parts = splitQuery("Pin -Code");
expect(parts).toEqual(["Pin", "-Code"]);
});

it("can split Chinese characters", () => {
const parts = splitQuery("Hello from 中国 上海");
expect(parts).toEqual(["Hello", "from", "中国", "上海"]);
Expand Down
Loading