diff options
author | Till <2353100+S7evinK@users.noreply.github.com> | 2022-09-07 18:15:54 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-09-07 18:15:54 +0200 |
commit | d5876abbe9f5484768f603ec91a567b8650e6e73 (patch) | |
tree | e8288bac7557a840ed636391ce5afbc7059dc993 /internal | |
parent | 31f4ae8997af7e939f505107341b86b2abd3fd9a (diff) |
Fulltext implementation incl. config (#2480)
This adds the main component of the fulltext search.
This PR doesn't do anything yet, besides creating an empty fulltextindex
folder if enabled. Indexing events is done in a separate PR.
Diffstat (limited to 'internal')
-rw-r--r-- | internal/fulltext/bleve.go | 164 |
-rw-r--r-- | internal/fulltext/bleve_test.go | 250 |
-rw-r--r-- | internal/fulltext/bleve_wasm.go | 65 |
3 files changed, 479 insertions, 0 deletions
diff --git a/internal/fulltext/bleve.go b/internal/fulltext/bleve.go new file mode 100644 index 00000000..b07c0e51 --- /dev/null +++ b/internal/fulltext/bleve.go @@ -0,0 +1,164 @@ +// Copyright 2022 The Matrix.org Foundation C.I.C. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !wasm +// +build !wasm + +package fulltext + +import ( + "strings" + + "github.com/blevesearch/bleve/v2" + "github.com/blevesearch/bleve/v2/mapping" + "github.com/matrix-org/dendrite/setup/config" + "github.com/matrix-org/gomatrixserverlib" +) + +// Search contains all existing bleve.Index +type Search struct { + FulltextIndex bleve.Index +} + +// IndexElement describes the layout of an element to index +type IndexElement struct { + EventID string + RoomID string + Content string + ContentType string + StreamPosition int64 +} + +// SetContentType sets i.ContentType given an identifier +func (i *IndexElement) SetContentType(v string) { + switch v { + case "m.room.message": + i.ContentType = "content.body" + case gomatrixserverlib.MRoomName: + i.ContentType = "content.name" + case gomatrixserverlib.MRoomTopic: + i.ContentType = "content.topic" + } +} + +// New opens a new/existing fulltext index +func New(cfg config.Fulltext) (fts *Search, err error) { + fts = &Search{} + fts.FulltextIndex, err = openIndex(cfg) + if err != nil { + return nil, err + } + return fts, nil +} + +// Close closes the fulltext index +func (f *Search) Close() error { + return 
f.FulltextIndex.Close() +} + +// Index indexes the given elements +func (f *Search) Index(elements ...IndexElement) error { + batch := f.FulltextIndex.NewBatch() + + for _, element := range elements { + err := batch.Index(element.EventID, element) + if err != nil { + return err + } + } + return f.FulltextIndex.Batch(batch) +} + +// Delete deletes an indexed element by the eventID +func (f *Search) Delete(eventID string) error { + return f.FulltextIndex.Delete(eventID) +} + +// Search searches the index given a search term, roomIDs and keys. +func (f *Search) Search(term string, roomIDs, keys []string, limit, from int, orderByStreamPos bool) (*bleve.SearchResult, error) { + qry := bleve.NewConjunctionQuery() + termQuery := bleve.NewBooleanQuery() + + terms := strings.Split(term, " ") + for _, term := range terms { + matchQuery := bleve.NewMatchQuery(term) + matchQuery.SetField("Content") + termQuery.AddMust(matchQuery) + } + qry.AddQuery(termQuery) + + roomQuery := bleve.NewBooleanQuery() + for _, roomID := range roomIDs { + roomSearch := bleve.NewMatchQuery(roomID) + roomSearch.SetField("RoomID") + roomQuery.AddShould(roomSearch) + } + if len(roomIDs) > 0 { + qry.AddQuery(roomQuery) + } + keyQuery := bleve.NewBooleanQuery() + for _, key := range keys { + keySearch := bleve.NewMatchQuery(key) + keySearch.SetField("ContentType") + keyQuery.AddShould(keySearch) + } + if len(keys) > 0 { + qry.AddQuery(keyQuery) + } + + s := bleve.NewSearchRequestOptions(qry, limit, from, false) + s.Fields = []string{"*"} + s.SortBy([]string{"_score"}) + if orderByStreamPos { + s.SortBy([]string{"-StreamPosition"}) + } + + return f.FulltextIndex.Search(s) +} + +func openIndex(cfg config.Fulltext) (bleve.Index, error) { + m := getMapping(cfg) + if cfg.InMemory { + return bleve.NewMemOnly(m) + } + if index, err := bleve.Open(string(cfg.IndexPath)); err == nil { + return index, nil + } + + index, err := bleve.New(string(cfg.IndexPath), m) + if err != nil { + return nil, err + } + return 
index, nil +} + +func getMapping(cfg config.Fulltext) *mapping.IndexMappingImpl { + enFieldMapping := bleve.NewTextFieldMapping() + enFieldMapping.Analyzer = cfg.Language + + eventMapping := bleve.NewDocumentMapping() + eventMapping.AddFieldMappingsAt("Content", enFieldMapping) + eventMapping.AddFieldMappingsAt("StreamPosition", bleve.NewNumericFieldMapping()) + + // Index entries as is + idFieldMapping := bleve.NewKeywordFieldMapping() + eventMapping.AddFieldMappingsAt("ContentType", idFieldMapping) + eventMapping.AddFieldMappingsAt("RoomID", idFieldMapping) + eventMapping.AddFieldMappingsAt("EventID", idFieldMapping) + + indexMapping := bleve.NewIndexMapping() + indexMapping.AddDocumentMapping("Event", eventMapping) + indexMapping.DefaultType = "Event" + return indexMapping +} diff --git a/internal/fulltext/bleve_test.go b/internal/fulltext/bleve_test.go new file mode 100644 index 00000000..84a28242 --- /dev/null +++ b/internal/fulltext/bleve_test.go @@ -0,0 +1,250 @@ +// Copyright 2022 The Matrix.org Foundation C.I.C. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fulltext_test + +import ( + "reflect" + "testing" + + "github.com/matrix-org/gomatrixserverlib" + "github.com/matrix-org/util" + + "github.com/matrix-org/dendrite/internal/fulltext" + "github.com/matrix-org/dendrite/setup/config" +) + +func mustOpenIndex(t *testing.T, tempDir string) *fulltext.Search { + t.Helper() + cfg := config.Fulltext{} + cfg.Defaults(config.DefaultOpts{ + Generate: true, + Monolithic: true, + }) + if tempDir != "" { + cfg.IndexPath = config.Path(tempDir) + cfg.InMemory = false + } + fts, err := fulltext.New(cfg) + if err != nil { + t.Fatal("failed to open fulltext index:", err) + } + return fts +} + +func mustAddTestData(t *testing.T, fts *fulltext.Search, firstStreamPos int64) (eventIDs, roomIDs []string) { + t.Helper() + // create some more random data + var batchItems []fulltext.IndexElement + streamPos := firstStreamPos + + wantRoomID := util.RandomString(16) + + for i := 0; i < 30; i++ { + streamPos++ + eventID := util.RandomString(16) + // Create more data for the first room + if i > 15 { + wantRoomID = util.RandomString(16) + } + e := fulltext.IndexElement{ + EventID: eventID, + RoomID: wantRoomID, + Content: "lorem ipsum", + StreamPosition: streamPos, + } + e.SetContentType("m.room.message") + batchItems = append(batchItems, e) + roomIDs = append(roomIDs, wantRoomID) + eventIDs = append(eventIDs, eventID) + } + e := fulltext.IndexElement{ + EventID: util.RandomString(16), + RoomID: wantRoomID, + Content: "Roomname testing", + StreamPosition: streamPos, + } + e.SetContentType(gomatrixserverlib.MRoomName) + batchItems = append(batchItems, e) + e = fulltext.IndexElement{ + EventID: util.RandomString(16), + RoomID: wantRoomID, + Content: "Room topic fulltext", + StreamPosition: streamPos, + } + e.SetContentType(gomatrixserverlib.MRoomTopic) + batchItems = append(batchItems, e) + if err := fts.Index(batchItems...); err != nil { + t.Fatalf("failed to batch insert elements: %v", err) + } + return eventIDs, roomIDs +} + +func 
TestOpen(t *testing.T) { + dataDir := t.TempDir() + fts := mustOpenIndex(t, dataDir) + if err := fts.Close(); err != nil { + t.Fatal("unable to close fulltext index", err) + } + + // open existing index + fts = mustOpenIndex(t, dataDir) + defer fts.Close() +} + +func TestIndex(t *testing.T) { + fts := mustOpenIndex(t, "") + defer fts.Close() + + // add some data + var streamPos int64 = 1 + roomID := util.RandomString(8) + eventID := util.RandomString(16) + e := fulltext.IndexElement{ + EventID: eventID, + RoomID: roomID, + Content: "lorem ipsum", + StreamPosition: streamPos, + } + e.SetContentType("m.room.message") + + if err := fts.Index(e); err != nil { + t.Fatal("failed to index element", err) + } + + // create some more random data + mustAddTestData(t, fts, streamPos) +} + +func TestDelete(t *testing.T) { + fts := mustOpenIndex(t, "") + defer fts.Close() + eventIDs, roomIDs := mustAddTestData(t, fts, 0) + res1, err := fts.Search("lorem", roomIDs[:1], nil, 50, 0, false) + if err != nil { + t.Fatal(err) + } + + if err = fts.Delete(eventIDs[0]); err != nil { + t.Fatal(err) + } + + res2, err := fts.Search("lorem", roomIDs[:1], nil, 50, 0, false) + if err != nil { + t.Fatal(err) + } + + if res1.Total <= res2.Total { + t.Fatalf("got unexpected result: %d <= %d", res1.Total, res2.Total) + } +} + +func TestSearch(t *testing.T) { + type args struct { + term string + keys []string + limit int + from int + orderByStreamPos bool + roomIndex []int + } + tests := []struct { + name string + args args + wantCount int + wantErr bool + }{ + { + name: "Can search for many results in one room", + wantCount: 16, + args: args{ + term: "lorem", + roomIndex: []int{0}, + limit: 20, + }, + }, + { + name: "Can search for one result in one room", + wantCount: 1, + args: args{ + term: "lorem", + roomIndex: []int{16}, + limit: 20, + }, + }, + { + name: "Can search for many results in multiple rooms", + wantCount: 17, + args: args{ + term: "lorem", + roomIndex: []int{0, 16}, + limit: 20, + 
}, + }, + { + name: "Can search for many results in all rooms, reversed", + wantCount: 30, + args: args{ + term: "lorem", + limit: 30, + orderByStreamPos: true, + }, + }, + { + name: "Can search for specific search room name", + wantCount: 1, + args: args{ + term: "testing", + roomIndex: []int{}, + limit: 20, + keys: []string{"content.name"}, + }, + }, + { + name: "Can search for specific search room topic", + wantCount: 1, + args: args{ + term: "fulltext", + roomIndex: []int{}, + limit: 20, + keys: []string{"content.topic"}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + f := mustOpenIndex(t, "") + eventIDs, roomIDs := mustAddTestData(t, f, 0) + var searchRooms []string + for _, x := range tt.args.roomIndex { + searchRooms = append(searchRooms, roomIDs[x]) + } + t.Logf("searching in rooms: %v - %v\n", searchRooms, tt.args.keys) + + got, err := f.Search(tt.args.term, searchRooms, tt.args.keys, tt.args.limit, tt.args.from, tt.args.orderByStreamPos) + if (err != nil) != tt.wantErr { + t.Errorf("Search() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(len(got.Hits), tt.wantCount) { + t.Errorf("Search() got = %v, want %v", len(got.Hits), tt.wantCount) + } + if tt.args.orderByStreamPos { + if got.Hits[0].ID != eventIDs[29] { + t.Fatalf("expected ID %s, got %s", eventIDs[29], got.Hits[0].ID) + } + } + }) + } +} diff --git a/internal/fulltext/bleve_wasm.go b/internal/fulltext/bleve_wasm.go new file mode 100644 index 00000000..a69a8926 --- /dev/null +++ b/internal/fulltext/bleve_wasm.go @@ -0,0 +1,65 @@ +// Copyright 2022 The Matrix.org Foundation C.I.C. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fulltext + +import ( + "github.com/matrix-org/dendrite/setup/config" + "time" +) + +type Search struct{} +type IndexElement struct { + EventID string + RoomID string + Content string + ContentType string + StreamPosition int64 +} + +type SearchResult struct { + Status interface{} `json:"status"` + Request *interface{} `json:"request"` + Hits []interface{} `json:"hits"` + Total uint64 `json:"total_hits"` + MaxScore float64 `json:"max_score"` + Took time.Duration `json:"took"` + Facets interface{} `json:"facets"` +} + +func (i *IndexElement) SetContentType(v string) {} + +func New(cfg config.Fulltext) (fts *Search, err error) { + return &Search{}, nil +} + +func (f *Search) Close() error { + return nil +} + +func (f *Search) Index(e IndexElement) error { + return nil +} + +func (f *Search) BatchIndex(elements []IndexElement) error { + return nil +} + +func (f *Search) Delete(eventID string) error { + return nil +} + +func (f *Search) Search(term string, roomIDs, keys []string, limit, from int, orderByStreamPos bool) (SearchResult, error) { + return SearchResult{}, nil +} |