matchtree.go at f2577c25dc61f42522172d128c01b3cba7ccc373 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchtree.go
at f2577c25dc61f42522172d128c01b3cba7ccc373 34 kB View raw
Keegan Carruthers-Smith matchtree: do not depend on String for symbol all optimization (#746) 2y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"regexp/syntax"
  22	"strings"
  23	"unicode/utf8"
  24
  25	"github.com/grafana/regexp"
  26
  27	"github.com/sourcegraph/zoekt/query"
  28)
  29
  30// A docIterator iterates over documents in order.
  31type docIterator interface {
  32	// provide the next document where we may find something interesting.
  33	//
  34	// This is like a "peek" and shouldn't mutate state. prepare is what should
  35	// change state.
  36	nextDoc() uint32
  37
  38	// clears any per-document state of the docIterator, and
  39	// prepares for evaluating the given doc. The argument is
  40	// strictly increasing over time.
  41	prepare(nextDoc uint32)
  42}
  43
  44// costs are passed in increasing order to matchTree.matches until they do not
  45// return matchesRequiresHigherCost.
  46const (
  47	costConst   = 0
  48	costMemory  = 1
  49	costContent = 2
  50	costRegexp  = 3
  51)
  52
  53const (
  54	costMin = costConst
  55	costMax = costRegexp
  56)
  57
  58// matchesState is an enum for the state of a matchTree after a call to
  59// matchTree.matches.
  60type matchesState uint8
  61
  62const (
  63	// matchesRequiresHigherCost is returned when matchTree.matches hasn't done
  64	// a search yet since the cost value is not high enough.
  65	matchesRequiresHigherCost matchesState = iota
  66
  67	// matchesFound is returned when matchTree.matches has done a search and
  68	// found one or more matches.
  69	matchesFound
  70
  71	// matchesNone is returned when matchTree.matches has done a search and
  72	// found nothing.
  73	matchesNone
  74)
  75
  76// matchesStatePred is a helper which returns matchesFound if b is true
  77// otherwise returns matchesNone.
  78func matchesStatePred(b bool) matchesState {
  79	if b {
  80		return matchesFound
  81	}
  82	return matchesNone
  83}
  84
  85// matchesStateForSlice is a helper which returns matchesFound if v is
  86// non-empty otherwise returns matchesNone.
  87func matchesStateForSlice[T any](v []T) matchesState {
  88	return matchesStatePred(len(v) > 0)
  89}
  90
  91// An expression tree coupled with matches. The matchtree has two
  92// functions:
  93//
  94// * it implements boolean combinations (and, or, not)
  95//
  96// * it implements shortcuts, where we skip documents (for example: if
  97// there are no trigram matches, we can be sure there are no substring
  98// matches). The matchtree iterates over the documents as they are
  99// ordered in the shard.
 100//
 101// The general process for a given (shard, query) is:
 102//
 103// - construct matchTree for the query
 104//
 105// - find all different leaf matchTrees (substring, regexp, etc.)
 106//
 107// in a loop:
 108//
 109//   - find next doc to process using nextDoc
 110//
 111//   - evaluate atoms (leaf expressions that match text)
 112//
 113//   - evaluate the tree using matches(), storing the result in map.
 114//
 115//   - if the complete tree returns (matches() != matchesRequiresHigherCost)
 116//     for the document, collect all text matches by looking at leaf
 117//     matchTrees.
 118type matchTree interface {
 119	docIterator
 120
 121	// matches if cost is high enough. See documentation for matchesState's
 122	// values.
 123	//
 124	// Note: Do not call this directly, rather use evalMatchTree which uses
 125	// known to cache responses once the state transitions away from
 126	// matchesRequiresHigherCost.
 127	matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
 128}
 129
 130// docMatchTree iterates over documents for which predicate(docID) returns true.
 131type docMatchTree struct {
 132	// the number of documents in a shard.
 133	numDocs uint32
 134
 135	predicate func(docID uint32) bool
 136
 137	// provides additional information about the reason why the docMatchTree was
 138	// created.
 139	reason string
 140
 141	// mutable
 142	firstDone bool
 143	docID     uint32
 144}
 145
 146type bruteForceMatchTree struct {
 147	// mutable
 148	firstDone bool
 149	docID     uint32
 150}
 151
 152type andLineMatchTree struct {
 153	andMatchTree
 154}
 155
 156type andMatchTree struct {
 157	children []matchTree
 158}
 159
 160type orMatchTree struct {
 161	children []matchTree
 162}
 163
 164type notMatchTree struct {
 165	child matchTree
 166}
 167
 168// Returns only the filename of child matches.
 169type fileNameMatchTree struct {
 170	child matchTree
 171}
 172
 173type boostMatchTree struct {
 174	child matchTree
 175	boost float64
 176}
 177
 178// Don't visit this subtree for collecting matches.
 179type noVisitMatchTree struct {
 180	matchTree
 181}
 182
 183type regexpMatchTree struct {
 184	regexp *regexp.Regexp
 185
 186	// origRegexp is the original parsed regexp from the query structure. It
 187	// does not include mutations such as case sensitivity.
 188	origRegexp *syntax.Regexp
 189
 190	fileName bool
 191
 192	// mutable
 193	reEvaluated bool
 194	found       []*candidateMatch
 195
 196	// nextDoc, prepare.
 197	bruteForceMatchTree
 198}
 199
 200func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
 201	prefix := ""
 202	if !s.CaseSensitive {
 203		prefix = "(?i)"
 204	}
 205
 206	return &regexpMatchTree{
 207		regexp:     regexp.MustCompile(prefix + s.Regexp.String()),
 208		origRegexp: s.Regexp,
 209		fileName:   s.FileName,
 210	}
 211}
 212
 213// \bLITERAL\b
 214type wordMatchTree struct {
 215	word string
 216
 217	fileName bool
 218
 219	// mutable
 220	evaluated bool
 221	found     []*candidateMatch
 222
 223	// nextDoc, prepare.
 224	bruteForceMatchTree
 225}
 226
 227type substrMatchTree struct {
 228	matchIterator
 229
 230	query         *query.Substring
 231	caseSensitive bool
 232	fileName      bool
 233
 234	// mutable
 235	current       []*candidateMatch
 236	contEvaluated bool
 237}
 238
 239type branchQueryMatchTree struct {
 240	fileMasks []uint64
 241	masks     []uint64
 242	repos     []uint16
 243
 244	// mutable
 245	firstDone bool
 246	docID     uint32
 247}
 248
 249func (t *branchQueryMatchTree) branchMask() uint64 {
 250	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 251}
 252
 253type symbolRegexpMatchTree struct {
 254	matchTree
 255	regexp *regexp.Regexp
 256	all    bool // skips regex match if .*
 257
 258	reEvaluated bool
 259	found       []*candidateMatch
 260}
 261
 262func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 263	t.reEvaluated = false
 264	t.found = t.found[:0]
 265	t.matchTree.prepare(doc)
 266}
 267
 268func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 269	if t.reEvaluated {
 270		return matchesStateForSlice(t.found)
 271	}
 272
 273	if cost < costRegexp {
 274		return matchesRequiresHigherCost
 275	}
 276
 277	sections := cp.docSections()
 278	content := cp.data(false)
 279
 280	found := t.found[:0]
 281	for i, sec := range sections {
 282		var idx []int
 283		if t.all {
 284			idx = []int{0, int(sec.End - sec.Start)}
 285		} else {
 286			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 287			if idx == nil {
 288				continue
 289			}
 290		}
 291
 292		cm := &candidateMatch{
 293			byteOffset:  sec.Start + uint32(idx[0]),
 294			byteMatchSz: uint32(idx[1] - idx[0]),
 295			symbol:      true,
 296			symbolIdx:   uint32(i),
 297		}
 298		found = append(found, cm)
 299	}
 300	t.found = found
 301	t.reEvaluated = true
 302
 303	return matchesStateForSlice(t.found)
 304}
 305
 306type symbolSubstrMatchTree struct {
 307	*substrMatchTree
 308
 309	patternSize   uint32
 310	fileEndRunes  []uint32
 311	fileEndSymbol []uint32
 312
 313	doc      uint32
 314	sections []DocumentSection
 315
 316	secID uint32
 317}
 318
 319func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 320	t.substrMatchTree.prepare(doc)
 321	t.doc = doc
 322
 323	var fileStart uint32
 324	if doc > 0 {
 325		fileStart = t.fileEndRunes[doc-1]
 326	}
 327
 328	var sections []DocumentSection
 329	if len(t.sections) > 0 {
 330		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 331		if most == uint32(len(t.sections)) {
 332			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 333		} else {
 334			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 335				t.secID++
 336			}
 337
 338			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 339			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 340				symbolEnd++
 341			}
 342
 343			sections = t.sections[t.secID:symbolEnd]
 344		}
 345	}
 346
 347	secIdx := 0
 348	trimmed := t.current[:0]
 349	for len(sections) > secIdx && len(t.current) > 0 {
 350		start := fileStart + t.current[0].runeOffset
 351		end := start + t.patternSize
 352		if start >= sections[secIdx].End {
 353			secIdx++
 354			continue
 355		}
 356
 357		if start < sections[secIdx].Start {
 358			t.current = t.current[1:]
 359			continue
 360		}
 361
 362		if end <= sections[secIdx].End {
 363			t.current[0].symbol = true
 364			t.current[0].symbolIdx = uint32(secIdx)
 365			trimmed = append(trimmed, t.current[0])
 366		}
 367
 368		t.current = t.current[1:]
 369	}
 370	t.current = trimmed
 371}
 372
 373// all prepare methods
 374
 375func (t *bruteForceMatchTree) prepare(doc uint32) {
 376	t.docID = doc
 377	t.firstDone = true
 378}
 379
 380func (t *docMatchTree) prepare(doc uint32) {
 381	t.docID = doc
 382	t.firstDone = true
 383}
 384
 385func (t *andMatchTree) prepare(doc uint32) {
 386	for _, c := range t.children {
 387		c.prepare(doc)
 388	}
 389}
 390
 391func (t *regexpMatchTree) prepare(doc uint32) {
 392	t.found = t.found[:0]
 393	t.reEvaluated = false
 394	t.bruteForceMatchTree.prepare(doc)
 395}
 396
 397func (t *wordMatchTree) prepare(doc uint32) {
 398	t.found = t.found[:0]
 399	t.evaluated = false
 400	t.bruteForceMatchTree.prepare(doc)
 401}
 402
 403func (t *orMatchTree) prepare(doc uint32) {
 404	for _, c := range t.children {
 405		c.prepare(doc)
 406	}
 407}
 408
 409func (t *notMatchTree) prepare(doc uint32) {
 410	t.child.prepare(doc)
 411}
 412
 413func (t *fileNameMatchTree) prepare(doc uint32) {
 414	t.child.prepare(doc)
 415}
 416
 417func (t *boostMatchTree) prepare(doc uint32) {
 418	t.child.prepare(doc)
 419}
 420
 421func (t *substrMatchTree) prepare(nextDoc uint32) {
 422	t.matchIterator.prepare(nextDoc)
 423	t.current = t.matchIterator.candidates()
 424	t.contEvaluated = false
 425}
 426
 427func (t *branchQueryMatchTree) prepare(doc uint32) {
 428	t.firstDone = true
 429	t.docID = doc
 430}
 431
 432// nextDoc
 433
 434func (t *docMatchTree) nextDoc() uint32 {
 435	var start uint32
 436	if t.firstDone {
 437		start = t.docID + 1
 438	}
 439	for i := start; i < t.numDocs; i++ {
 440		if t.predicate(i) {
 441			return i
 442		}
 443	}
 444	return maxUInt32
 445}
 446
 447func (t *bruteForceMatchTree) nextDoc() uint32 {
 448	if !t.firstDone {
 449		return 0
 450	}
 451	return t.docID + 1
 452}
 453
 454func (t *andMatchTree) nextDoc() uint32 {
 455	var max uint32
 456	for _, c := range t.children {
 457		m := c.nextDoc()
 458		if m > max {
 459			max = m
 460		}
 461	}
 462	return max
 463}
 464
 465func (t *orMatchTree) nextDoc() uint32 {
 466	min := uint32(maxUInt32)
 467	for _, c := range t.children {
 468		m := c.nextDoc()
 469		if m < min {
 470			min = m
 471		}
 472	}
 473	return min
 474}
 475
 476func (t *notMatchTree) nextDoc() uint32 {
 477	return 0
 478}
 479
 480func (t *fileNameMatchTree) nextDoc() uint32 {
 481	return t.child.nextDoc()
 482}
 483
 484func (t *boostMatchTree) nextDoc() uint32 {
 485	return t.child.nextDoc()
 486}
 487
 488func (t *branchQueryMatchTree) nextDoc() uint32 {
 489	var start uint32
 490	if t.firstDone {
 491		start = t.docID + 1
 492	}
 493
 494	for i := start; i < uint32(len(t.fileMasks)); i++ {
 495		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 496			return i
 497		}
 498	}
 499	return maxUInt32
 500}
 501
 502// all String methods
 503
 504func (t *bruteForceMatchTree) String() string {
 505	return "all"
 506}
 507
 508func (t *docMatchTree) String() string {
 509	return fmt.Sprintf("doc(%s)", t.reason)
 510}
 511
 512func (t *andMatchTree) String() string {
 513	return fmt.Sprintf("and%v", t.children)
 514}
 515
 516func (t *regexpMatchTree) String() string {
 517	f := ""
 518	if t.fileName {
 519		f = "f"
 520	}
 521	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 522}
 523
 524func (t *wordMatchTree) String() string {
 525	f := ""
 526	if t.fileName {
 527		f = "f"
 528	}
 529	return fmt.Sprintf("%sword(%s)", f, t.word)
 530}
 531
 532func (t *orMatchTree) String() string {
 533	return fmt.Sprintf("or%v", t.children)
 534}
 535
 536func (t *notMatchTree) String() string {
 537	return fmt.Sprintf("not(%v)", t.child)
 538}
 539
 540func (t *noVisitMatchTree) String() string {
 541	return fmt.Sprintf("novisit(%v)", t.matchTree)
 542}
 543
 544func (t *fileNameMatchTree) String() string {
 545	return fmt.Sprintf("f(%v)", t.child)
 546}
 547
 548func (t *boostMatchTree) String() string {
 549	return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
 550}
 551
 552func (t *substrMatchTree) String() string {
 553	f := ""
 554	if t.fileName {
 555		f = "f"
 556	}
 557
 558	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 559}
 560
 561func (t *branchQueryMatchTree) String() string {
 562	return fmt.Sprintf("branch(%x)", t.masks)
 563}
 564
 565func (t *symbolSubstrMatchTree) String() string {
 566	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 567}
 568
 569func (t *symbolRegexpMatchTree) String() string {
 570	return fmt.Sprintf("symbol(%v)", t.matchTree)
 571}
 572
 573// visitMatches visits all atoms in matchTree. Note: This visits
 574// noVisitMatchTree. For collecting matches use visitMatches.
 575func visitMatchTree(t matchTree, f func(matchTree)) {
 576	switch s := t.(type) {
 577	case *andMatchTree:
 578		for _, ch := range s.children {
 579			visitMatchTree(ch, f)
 580		}
 581	case *orMatchTree:
 582		for _, ch := range s.children {
 583			visitMatchTree(ch, f)
 584		}
 585	case *andLineMatchTree:
 586		visitMatchTree(&s.andMatchTree, f)
 587	case *noVisitMatchTree:
 588		visitMatchTree(s.matchTree, f)
 589	case *notMatchTree:
 590		visitMatchTree(s.child, f)
 591	case *fileNameMatchTree:
 592		visitMatchTree(s.child, f)
 593	case *boostMatchTree:
 594		visitMatchTree(s.child, f)
 595	case *symbolSubstrMatchTree:
 596		visitMatchTree(s.substrMatchTree, f)
 597	case *symbolRegexpMatchTree:
 598		visitMatchTree(s.matchTree, f)
 599	default:
 600		f(t)
 601	}
 602}
 603
 604// updateMatchTreeStats calls updateStats on all atoms in mt which have that
 605// function defined.
 606func updateMatchTreeStats(mt matchTree, stats *Stats) {
 607	visitMatchTree(mt, func(mt matchTree) {
 608		if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
 609			atom.updateStats(stats)
 610		}
 611	})
 612}
 613
 614func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 615	visitMatches(t, known, 1, func(mt matchTree, _ float64) {
 616		f(mt)
 617	})
 618}
 619
 620// visitMatches visits all atoms which can contribute matches. Note: This
 621// skips noVisitMatchTree.
 622func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
 623	switch s := t.(type) {
 624	case *andMatchTree:
 625		for _, ch := range s.children {
 626			if known[ch] {
 627				visitMatches(ch, known, weight, f)
 628			}
 629		}
 630	case *andLineMatchTree:
 631		visitMatches(&s.andMatchTree, known, weight, f)
 632	case *orMatchTree:
 633		for _, ch := range s.children {
 634			if known[ch] {
 635				visitMatches(ch, known, weight, f)
 636			}
 637		}
 638	case *boostMatchTree:
 639		visitMatches(s.child, known, weight*s.boost, f)
 640	case *symbolSubstrMatchTree:
 641		visitMatches(s.substrMatchTree, known, weight, f)
 642	case *notMatchTree:
 643	case *noVisitMatchTree:
 644		// don't collect into negative trees.
 645	case *fileNameMatchTree:
 646		// We will just gather the filename if we do not visit this tree.
 647	default:
 648		f(s, weight)
 649	}
 650}
 651
 652// all matches() methods.
 653
 654func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 655	return matchesStatePred(t.predicate(cp.idx))
 656}
 657
 658func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 659	return matchesFound
 660}
 661
 662// andLineMatchTree is a performance optimization of andMatchTree. For content
 663// searches we don't want to run the regex engine if there is no line that
 664// contains matches from all terms.
 665func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 666	if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
 667		return state
 668	}
 669
 670	// Invariant: all children have matches. If any line contains all of them we
 671	// can return MatchesFound.
 672
 673	// find child with fewest candidates
 674	min := maxUInt32
 675	fewestChildren := 0
 676	for ix, child := range t.children {
 677		v, ok := child.(*substrMatchTree)
 678		// make sure we are running a content search and that all candidates are a
 679		// substrMatchTree
 680		if !ok || v.fileName {
 681			return matchesFound
 682		}
 683		if len(v.current) < min {
 684			min = len(v.current)
 685			fewestChildren = ix
 686		}
 687	}
 688
 689	type lineRange struct {
 690		start int
 691		end   int
 692	}
 693	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 694	prev := -1
 695	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 696		line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
 697		if line == prev {
 698			continue
 699		}
 700		prev = line
 701		lines = append(lines, lineRange{byteStart, byteEnd})
 702	}
 703
 704	// children keeps track of the children's candidates we have already seen.
 705	children := make([][]*candidateMatch, 0, len(t.children)-1)
 706	for j, child := range t.children {
 707		if j == fewestChildren {
 708			continue
 709		}
 710		children = append(children, child.(*substrMatchTree).current)
 711	}
 712
 713nextLine:
 714	for i := 0; i < len(lines); i++ {
 715		hits := 1
 716	nextChild:
 717		for j := range children {
 718		nextCandidate:
 719			for len(children[j]) > 0 {
 720				candidate := children[j][0]
 721				bo := int(cp.findOffset(false, candidate.runeOffset))
 722				if bo < lines[i].start {
 723					children[j] = children[j][1:]
 724					continue nextCandidate
 725				}
 726				if bo <= lines[i].end {
 727					hits++
 728					continue nextChild
 729				}
 730				// move the `lines` iterator forward until bo <= line.end
 731				for i < len(lines) && bo > lines[i].end {
 732					i++
 733				}
 734				i--
 735				continue nextLine
 736			}
 737		}
 738		// return early once we found any line that contains matches from all children
 739		if hits == len(t.children) {
 740			return matchesFound
 741		}
 742	}
 743	return matchesNone
 744}
 745
 746func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 747	// We have found matches unless a child needs to do more work or it hasn't
 748	// found matches.
 749	state := matchesFound
 750
 751	for _, ch := range t.children {
 752		switch evalMatchTree(cp, cost, known, ch) {
 753		case matchesRequiresHigherCost:
 754			// keep evaluating other children incase we come across matchesNone
 755			state = matchesRequiresHigherCost
 756		case matchesFound:
 757			// will return this if every child has this value
 758		case matchesNone:
 759			return matchesNone
 760		}
 761	}
 762
 763	return state
 764}
 765
 766func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 767	// we could short-circuit, but we want to use the other possibilities as a
 768	// ranking signal. So we always return the most conservative state.
 769	state := matchesNone
 770	for _, ch := range t.children {
 771		switch evalMatchTree(cp, cost, known, ch) {
 772		case matchesRequiresHigherCost:
 773			state = matchesRequiresHigherCost
 774		case matchesFound:
 775			if state != matchesRequiresHigherCost {
 776				state = matchesFound
 777			}
 778		case matchesNone:
 779			// noop
 780		}
 781	}
 782	return state
 783}
 784
 785func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 786	return matchesStatePred(t.branchMask() != 0)
 787}
 788
 789func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 790	if t.reEvaluated {
 791		return matchesStateForSlice(t.found)
 792	}
 793
 794	if cost < costRegexp {
 795		return matchesRequiresHigherCost
 796	}
 797
 798	cp.stats.RegexpsConsidered++
 799	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 800	found := t.found[:0]
 801	for _, idx := range idxs {
 802		cm := &candidateMatch{
 803			byteOffset:  uint32(idx[0]),
 804			byteMatchSz: uint32(idx[1] - idx[0]),
 805			fileName:    t.fileName,
 806		}
 807
 808		found = append(found, cm)
 809	}
 810	t.found = found
 811	t.reEvaluated = true
 812
 813	return matchesStateForSlice(t.found)
 814}
 815
 816func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 817	if t.evaluated {
 818		return matchesStateForSlice(t.found)
 819	}
 820
 821	if cost < costRegexp {
 822		return matchesRequiresHigherCost
 823	}
 824
 825	data := cp.data(t.fileName)
 826	offset := 0
 827	found := t.found[:0]
 828	for {
 829		idx := bytes.Index(data[offset:], []byte(t.word))
 830		if idx < 0 {
 831			break
 832		}
 833
 834		relStartOffset := offset + idx
 835		relEndOffset := relStartOffset + len(t.word)
 836
 837		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 838		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 839		if startBoundary && endBoundary {
 840			found = append(found, &candidateMatch{
 841				byteOffset:  uint32(offset + idx),
 842				byteMatchSz: uint32(len(t.word)),
 843				fileName:    t.fileName,
 844			})
 845		}
 846		offset += idx + len(t.word)
 847	}
 848
 849	t.found = found
 850	t.evaluated = true
 851
 852	return matchesStateForSlice(t.found)
 853}
 854
 855// breakMatchesOnNewlines returns matches resulting from breaking each element
 856// of cms on newlines within text.
 857func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 858	var lineCMs []*candidateMatch
 859	for _, cm := range cms {
 860		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 861	}
 862	return lineCMs
 863}
 864
 865// breakOnNewlines returns matches resulting from breaking cm on newlines
 866// within text.
 867func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 868	var cms []*candidateMatch
 869	addMe := &candidateMatch{}
 870	*addMe = *cm
 871	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 872		if text[i] == '\n' {
 873			addMe.byteMatchSz = i - addMe.byteOffset
 874			if addMe.byteMatchSz != 0 {
 875				cms = append(cms, addMe)
 876			}
 877
 878			addMe = &candidateMatch{}
 879			*addMe = *cm
 880			addMe.byteOffset = i + 1
 881		}
 882	}
 883	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 884	if addMe.byteMatchSz != 0 {
 885		cms = append(cms, addMe)
 886	}
 887	return cms
 888}
 889
 890// evalMatchTree should be called instead of directly calling
 891// matchTree.matches. It cache known values for future evaluation at higher
 892// costs.
 893func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
 894	if v, ok := known[mt]; ok {
 895		return matchesStatePred(v)
 896	}
 897
 898	ms := mt.matches(cp, cost, known)
 899	if ms != matchesRequiresHigherCost {
 900		known[mt] = ms == matchesFound
 901	}
 902
 903	return ms
 904}
 905
 906func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 907	switch evalMatchTree(cp, cost, known, t.child) {
 908	case matchesRequiresHigherCost:
 909		return matchesRequiresHigherCost
 910	case matchesFound:
 911		return matchesNone
 912	case matchesNone:
 913		return matchesFound
 914	default:
 915		panic("unreachable")
 916	}
 917}
 918
 919func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 920	return evalMatchTree(cp, cost, known, t.child)
 921}
 922
 923func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 924	return evalMatchTree(cp, cost, known, t.child)
 925}
 926
 927func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 928	if t.contEvaluated {
 929		return matchesStateForSlice(t.current)
 930	}
 931
 932	if len(t.current) == 0 {
 933		return matchesNone
 934	}
 935
 936	if t.fileName && cost < costMemory {
 937		return matchesRequiresHigherCost
 938	}
 939
 940	if !t.fileName && cost < costContent {
 941		return matchesRequiresHigherCost
 942	}
 943
 944	pruned := t.current[:0]
 945	for _, m := range t.current {
 946		if m.byteOffset == 0 && m.runeOffset > 0 {
 947			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 948		}
 949		if m.matchContent(cp.data(m.fileName)) {
 950			pruned = append(pruned, m)
 951		}
 952	}
 953	t.current = pruned
 954	t.contEvaluated = true
 955
 956	return matchesStateForSlice(t.current)
 957}
 958
 959type matchTreeOpt struct {
 960	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 961	// This was added since we do not support wordMatchTree with symbol search.
 962	DisableWordMatchOptimization bool
 963}
 964
 965func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 966	if q == nil {
 967		return nil, fmt.Errorf("got nil (sub)query")
 968	}
 969	switch s := q.(type) {
 970	case *query.Regexp:
 971		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 972		// superset of the regexp. If the returned matchTree is equivalent to the
 973		// original regexp, it returns true. An equivalent matchTree has the same
 974		// behaviour as the original regexp and can be used instead.
 975		//
 976		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 977		if err != nil {
 978			return nil, err
 979		}
 980		// if the query can be used in place of the regexp
 981		// return the subtree
 982		if isEq {
 983			return subMT, nil
 984		}
 985
 986		var tr matchTree
 987		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
 988			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 989			// provide something faster.
 990			tr = wmt
 991		} else {
 992			tr = newRegexpMatchTree(s)
 993		}
 994
 995		return &andMatchTree{
 996			children: []matchTree{
 997				tr, &noVisitMatchTree{subMT},
 998			},
 999		}, nil
1000	case *query.And:
1001		var r []matchTree
1002		for _, ch := range s.Children {
1003			ct, err := d.newMatchTree(ch, opt)
1004			if err != nil {
1005				return nil, err
1006			}
1007			r = append(r, ct)
1008		}
1009		return &andMatchTree{r}, nil
1010	case *query.Or:
1011		var r []matchTree
1012		for _, ch := range s.Children {
1013			ct, err := d.newMatchTree(ch, opt)
1014			if err != nil {
1015				return nil, err
1016			}
1017			r = append(r, ct)
1018		}
1019		return &orMatchTree{r}, nil
1020	case *query.Not:
1021		ct, err := d.newMatchTree(s.Child, opt)
1022		return &notMatchTree{
1023			child: ct,
1024		}, err
1025
1026	case *query.Type:
1027		if s.Type != query.TypeFileName {
1028			break
1029		}
1030
1031		ct, err := d.newMatchTree(s.Child, opt)
1032		if err != nil {
1033			return nil, err
1034		}
1035
1036		return &fileNameMatchTree{
1037			child: ct,
1038		}, nil
1039
1040	case *query.Boost:
1041		ct, err := d.newMatchTree(s.Child, opt)
1042		if err != nil {
1043			return nil, err
1044		}
1045
1046		return &boostMatchTree{
1047			child: ct,
1048			boost: s.Boost,
1049		}, nil
1050
1051	case *query.Substring:
1052		return d.newSubstringMatchTree(s)
1053
1054	case *query.Branch:
1055		masks := make([]uint64, 0, len(d.repoMetaData))
1056		if s.Pattern == "HEAD" {
1057			for i := 0; i < len(d.repoMetaData); i++ {
1058				masks = append(masks, 1)
1059			}
1060		} else {
1061			for _, branchIDs := range d.branchIDs {
1062				mask := uint64(0)
1063				for nm, m := range branchIDs {
1064					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1065						mask |= uint64(m)
1066					}
1067				}
1068				masks = append(masks, mask)
1069			}
1070		}
1071		return &branchQueryMatchTree{
1072			masks:     masks,
1073			fileMasks: d.fileBranchMasks,
1074			repos:     d.repos,
1075		}, nil
1076	case *query.Const:
1077		if s.Value {
1078			return &bruteForceMatchTree{}, nil
1079		} else {
1080			return &noMatchTree{Why: "const"}, nil
1081		}
1082	case *query.Language:
1083		code, ok := d.metaData.LanguageMap[s.Language]
1084		if !ok {
1085			return &noMatchTree{Why: "lang"}, nil
1086		}
1087		return &docMatchTree{
1088			reason:  "language",
1089			numDocs: d.numDocs(),
1090			predicate: func(docID uint32) bool {
1091				return d.getLanguage(docID) == code
1092			},
1093		}, nil
1094
1095	case *query.Symbol:
1096		// Disable WordMatchTree since we don't support it in symbols yet.
1097		optCopy := opt
1098		optCopy.DisableWordMatchOptimization = true
1099
1100		subMT, err := d.newMatchTree(s.Expr, optCopy)
1101		if err != nil {
1102			return nil, err
1103		}
1104
1105		if substr, ok := subMT.(*substrMatchTree); ok {
1106			return &symbolSubstrMatchTree{
1107				substrMatchTree: substr,
1108				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
1109				fileEndRunes:    d.fileEndRunes,
1110				fileEndSymbol:   d.fileEndSymbol,
1111				sections:        d.runeDocSections,
1112			}, nil
1113		}
1114
1115		var regexpMT *regexpMatchTree
1116		visitMatchTree(subMT, func(mt matchTree) {
1117			if t, ok := mt.(*regexpMatchTree); ok {
1118				regexpMT = t
1119			}
1120		})
1121		if regexpMT == nil {
1122			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1123		}
1124
1125		return &symbolRegexpMatchTree{
1126			regexp:    regexpMT.regexp,
1127			all:       isRegexpAll(regexpMT.origRegexp),
1128			matchTree: subMT,
1129		}, nil
1130
1131	case *query.FileNameSet:
1132		return &docMatchTree{
1133			reason:  "FileNameSet",
1134			numDocs: d.numDocs(),
1135			predicate: func(docID uint32) bool {
1136				fileName := d.fileName(docID)
1137				_, ok := s.Set[string(fileName)]
1138				return ok
1139			},
1140		}, nil
1141
1142	case *query.BranchesRepos:
1143		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1144		for repoIdx := range d.repoMetaData {
1145			var mask uint64
1146			for _, br := range s.List {
1147				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1148					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1149				}
1150			}
1151			reposBranchesWant[repoIdx] = mask
1152		}
1153		return &docMatchTree{
1154			reason:  "BranchesRepos",
1155			numDocs: d.numDocs(),
1156			predicate: func(docID uint32) bool {
1157				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1158			},
1159		}, nil
1160
1161	case *query.RepoSet:
1162		reposWant := make([]bool, len(d.repoMetaData))
1163		for repoIdx, r := range d.repoMetaData {
1164			if _, ok := s.Set[r.Name]; ok {
1165				reposWant[repoIdx] = true
1166			}
1167		}
1168		return &docMatchTree{
1169			reason:  "RepoSet",
1170			numDocs: d.numDocs(),
1171			predicate: func(docID uint32) bool {
1172				return reposWant[d.repos[docID]]
1173			},
1174		}, nil
1175
1176	case *query.RepoIDs:
1177		reposWant := make([]bool, len(d.repoMetaData))
1178		for repoIdx, r := range d.repoMetaData {
1179			if s.Repos.Contains(r.ID) {
1180				reposWant[repoIdx] = true
1181			}
1182		}
1183		return &docMatchTree{
1184			reason:  "RepoIDs",
1185			numDocs: d.numDocs(),
1186			predicate: func(docID uint32) bool {
1187				return reposWant[d.repos[docID]]
1188			},
1189		}, nil
1190
1191	case *query.Repo:
1192		reposWant := make([]bool, len(d.repoMetaData))
1193		for repoIdx, r := range d.repoMetaData {
1194			if s.Regexp.MatchString(r.Name) {
1195				reposWant[repoIdx] = true
1196			}
1197		}
1198		return &docMatchTree{
1199			reason:  "Repo",
1200			numDocs: d.numDocs(),
1201			predicate: func(docID uint32) bool {
1202				return reposWant[d.repos[docID]]
1203			},
1204		}, nil
1205
1206	case *query.RepoRegexp:
1207		reposWant := make([]bool, len(d.repoMetaData))
1208		for repoIdx, r := range d.repoMetaData {
1209			if s.Regexp.MatchString(r.Name) {
1210				reposWant[repoIdx] = true
1211			}
1212		}
1213		return &docMatchTree{
1214			reason:  "RepoRegexp",
1215			numDocs: d.numDocs(),
1216			predicate: func(docID uint32) bool {
1217				return reposWant[d.repos[docID]]
1218			},
1219		}, nil
1220
1221	case query.RawConfig:
1222		return &docMatchTree{
1223			reason:  s.String(),
1224			numDocs: d.numDocs(),
1225			predicate: func(docID uint32) bool {
1226				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1227			},
1228		}, nil
1229	}
1230	log.Panicf("type %T", q)
1231	return nil, nil
1232}
1233
1234func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1235	st := &substrMatchTree{
1236		query:         s,
1237		caseSensitive: s.CaseSensitive,
1238		fileName:      s.FileName,
1239	}
1240
1241	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1242		return newRegexpMatchTree(&query.Regexp{
1243			Regexp:        &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1244			FileName:      s.FileName,
1245			Content:       s.Content,
1246			CaseSensitive: s.CaseSensitive,
1247		}), nil
1248	}
1249
1250	result, err := d.iterateNgrams(s)
1251	if err != nil {
1252		return nil, err
1253	}
1254	st.matchIterator = result
1255	return st, nil
1256}
1257
1258func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1259	if opt.DisableWordMatchOptimization {
1260		return nil, false
1261	}
1262	// Needs to be case sensitive
1263	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1264		return nil, false
1265	}
1266	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1267	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1268		return nil, false
1269	}
1270	sub := q.Regexp.Sub
1271	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1272		return nil, false
1273	}
1274
1275	return &wordMatchTree{
1276		word:     string(sub[1].Rune),
1277		fileName: q.FileName,
1278	}, true
1279}
1280
1281// pruneMatchTree removes impossible branches from the matchTree, as indicated
1282// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1283func pruneMatchTree(mt matchTree) (matchTree, error) {
1284	var err error
1285	switch mt := mt.(type) {
1286	// leaf nodes that we test with the filter:
1287	case *substrMatchTree:
1288		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1289			if _, ok := res.matchIterator.(*noMatchTree); ok {
1290				return nil, nil
1291			}
1292		}
1293	// recursive tree structures:
1294	case *andMatchTree:
1295		// Any branch of an and becoming impossible means the entire clause
1296		// is impossible. Otherwise, just handle rewrites.
1297		for i, child := range mt.children {
1298			newChild, err := pruneMatchTree(child)
1299			if err != nil {
1300				return nil, err
1301			}
1302			if newChild == nil {
1303				return nil, nil
1304			}
1305			mt.children[i] = newChild
1306		}
1307	case *orMatchTree:
1308		// *All* branches of an OR becoming impossible means the entire clause
1309		// is impossible. Otherwise, drop impossible subclauses and handle
1310		// rewrites, including simplifying to a singular resulting child branch.
1311		n := 0
1312		for _, child := range mt.children {
1313			newChild, err := pruneMatchTree(child)
1314			if err != nil {
1315				return nil, err
1316			}
1317			if newChild != nil {
1318				mt.children[n] = newChild
1319				n++
1320			}
1321		}
1322		mt.children = mt.children[:n]
1323		if len(mt.children) == 1 {
1324			return mt.children[0], nil
1325		} else if len(mt.children) == 0 {
1326			return nil, nil
1327		}
1328	case *noVisitMatchTree:
1329		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1330		if err != nil {
1331			return nil, err
1332		}
1333		if mt.matchTree == nil {
1334			return nil, nil
1335		}
1336	case *fileNameMatchTree:
1337		mt.child, err = pruneMatchTree(mt.child)
1338		if err != nil {
1339			return nil, err
1340		}
1341		if mt.child == nil {
1342			return nil, nil
1343		}
1344	case *boostMatchTree:
1345		mt.child, err = pruneMatchTree(mt.child)
1346		if err != nil {
1347			return nil, err
1348		}
1349		if mt.child == nil {
1350			return nil, nil
1351		}
1352	case *andLineMatchTree:
1353		child, err := pruneMatchTree(&mt.andMatchTree)
1354		if err != nil {
1355			return nil, err
1356		}
1357		if child == nil {
1358			return nil, nil
1359		}
1360		if c, ok := child.(*andMatchTree); ok {
1361			mt.andMatchTree = *c
1362		} else {
1363			// the and was simplified to a single clause,
1364			// so the linematch portion is irrelevant.
1365			return mt, nil
1366		}
1367	case *notMatchTree:
1368		mt.child, err = pruneMatchTree(mt.child)
1369		if err != nil {
1370			return nil, err
1371		}
1372		if mt.child == nil {
1373			// not false => true
1374			return &bruteForceMatchTree{}, nil
1375		}
1376	// unhandled:
1377	case *docMatchTree:
1378	case *bruteForceMatchTree:
1379	case *regexpMatchTree:
1380	case *wordMatchTree:
1381	}
1382	return mt, err
1383}
1384
1385// isRegexpAll returns true if the query matches all possible lines.
1386//
1387// Note: it is possible for a funky regex to actually match all but this
1388// returns false. This returns true for normal looking regexes like ".*" or
1389// "(?-s:.*)".
1390func isRegexpAll(r *syntax.Regexp) bool {
1391	// Note: we don't care about any flags since we are looking for .* and we
1392	// don't mind if . matches all or everything but newline.
1393
1394	// for loop is for visiting children of OpCapture/OpConcat until we hit .*
1395	for {
1396		// Our main target: .*
1397		if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1398			// (?s:.) or (?-s:.)
1399			return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1400		}
1401
1402		// Strip away expressions being wrapped in paranthesis
1403		if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1404			r = r.Sub[0]
1405			continue
1406		}
1407
1408		return false
1409	}
1410}
Configure Feed

Configure Feed