<?xml version="1.0" encoding="UTF-8" ?>

<schema name="cb-nano-content-node-4-solr-8" version="1.6">

    <!-- technical bookkeeping field used for transactions; kept in docValues only (not indexed, not stored) -->
    <field name="_version_"
           type="plong"
           docValues="true"
           indexed="false"
           stored="false"
           multiValued="false" />

    <!-- primary ID of the document: the absolute, resolved path of a file -->
    <field name="path"
           type="raw_str"
           required="true"
           multiValued="false"
           indexed="true"
           docValues="true"
           stored="false" />
    <!-- the path above doubles as the unique key of the index -->
    <uniqueKey>path</uniqueKey>

    <!-- document type flag, used as a filter during synchronization or querying -->
    <field name="file"
           type="bool"
           required="true"
           multiValued="false"
           indexed="true"
           stored="true"
           docValues="false"
           uninvertible="false" />

    <!-- indexed so that documents can be filtered by drive root path; stored so that the drive-relative path can be retrieved -->
    <field name="accessors"
           type="drive_path"
           required="true"
           multiValued="true"
           indexed="true"
           stored="true"
           docValues="false"
           uninvertible="false"
           omitTermFreqAndPositions="true" />

    <!-- indexed so that the direct children of a directory can be looked up by the parent path;
         values arrive through the copyField from "accessors" declared in this schema -->
    <!-- NOTE: required="true" does not appear to interfere with the posix-root path edge case:
         presumably it is enough that the document carries a value, even when the analyzer
         produces no tokens from it (TODO confirm) -->
    <field name="parents"
           type="parent_path"
           required="true"
           multiValued="true"
           indexed="true"
           stored="false"
           docValues="false"
           uninvertible="false"
           omitTermFreqAndPositions="true" />

    <!-- metadata used to detect stale index entries; neither field is searchable (indexed="false") -->
    <!-- size: kept as a stored value only -->
    <field name="size"
           type="plong"
           required="false"
           multiValued="false"
           indexed="false"
           stored="true"
           docValues="false" />
    <!-- mtime: kept in docValues only -->
    <field name="mtime"
           type="plong"
           required="false"
           multiValued="false"
           indexed="false"
           stored="false"
           docValues="true" />

    <!-- thumbnail data: the binary payload is stored but not searchable, the bucket number is both indexed and stored -->
    <field name="img_data"
           type="binary"
           required="false"
           multiValued="false"
           indexed="false"
           stored="true"
           docValues="false" />
    <field name="img_bucket"
           type="pint"
           required="false"
           multiValued="false"
           indexed="true"
           stored="true"
           docValues="false" />

    <!-- to support returning random results (currently disabled; uncomment the element below to enable) -->
    <!-- <dynamicField name="random_*" type="random" indexed="true" stored="false" /> -->

    <!-- user-searchable data; both fields are indexed and stored, single-valued and optional -->
    <field name="summary_faba"
           type="summary_faba"
           required="false"
           multiValued="false"
           indexed="true"
           stored="true"
           docValues="false"
           uninvertible="false" />
    <!-- text_faba additionally records term offsets alongside positions in the index -->
    <field name="text_faba"
           type="text_faba"
           required="false"
           multiValued="false"
           indexed="true"
           stored="true"
           docValues="false"
           uninvertible="false"
           storeOffsetsWithPositions="true" />

    <!-- primitive type definitions shared by the fields of this schema -->
    <fieldType name="bool" class="solr.BoolField" />
    <fieldType name="pint" class="solr.IntPointField" />
    <fieldType name="plong" class="solr.LongPointField" />
    <!-- NOTE(review): "date" is not referenced by any active field in this section; possibly used by template-inserted fields, confirm -->
    <fieldType name="date" class="solr.DatePointField" />
    <fieldType name="raw_str" class="solr.StrField" />
    <fieldType name="binary" class="solr.BinaryField" />
    <!-- "random" is only referenced by the commented-out random_* dynamicField in this section -->
    <fieldType name="random" class="solr.RandomSortField" />

    <!-- drive-relative storage path type definition -->
    <fieldType name="drive_path"
               class="solr.TextField"
               positionIncrementGap="10">
      <!-- index time: one token per path prefix, e.g. "/usr/local/apache" yields
           "/usr", "/usr/local" and "/usr/local/apache" -->
      <analyzer type="index">
        <tokenizer class="solr.PathHierarchyTokenizerFactory" />
      </analyzer>
      <!-- query time: the whole query value is kept as a single token, so it matches
           every document whose path starts with that prefix -->
      <analyzer type="query">
        <tokenizer class="solr.KeywordTokenizerFactory" />
      </analyzer>
      <!-- pure match/no-match scoring: a hit scores its query boost, no document or index statistics involved -->
      <similarity class="org.apache.lucene.search.similarities.BooleanSimilarity" />
    </fieldType>

    <!-- parent lookup path type definition -->
    <fieldType name="parent_path"
               class="solr.TextField"
               positionIncrementGap="10">
      <analyzer type="index">
        <!-- the quantifier is greedy, so capture group 1 spans everything up to (but excluding)
             the last slash of the path; a value with no slash after at least one character
             produces no token at all -->
        <tokenizer class="solr.PatternTokenizerFactory"
                   pattern="^(.+)/"
                   group="1" />
      </analyzer>
      <!-- query time: the parent path being looked up is kept as a single token -->
      <analyzer type="query">
        <tokenizer class="solr.KeywordTokenizerFactory" />
      </analyzer>
      <!-- pure match/no-match scoring: a hit scores its query boost, no document or index statistics involved -->
      <similarity class="org.apache.lucene.search.similarities.BooleanSimilarity" />
    </fieldType>

    <!-- fallback content type definitions -->
    <!--
    Solr's StandardTokenizer implements "Unicode Standard Annex #29" text segmentation.
    That works very well for real text, but not for file names, because one of the most
    common separator characters, "." (period), does not split tokens on its own. Examples:
        "file.txt" -> "file.txt"
        "file. txt" -> "file", "txt"
    Since this system deals primarily with files, that behaviour has to be improved. A custom
    tokenizer should be considered; until then, "summary" fields are split with a simple,
    broader regular expression: every run of underscores and/or non-word characters is a
    separator ((?U) switches the character classes to their Unicode-aware versions).
    -->
    <fieldType name="summary_faba"
               class="solr.TextField"
               positionIncrementGap="10">
      <analyzer>
        <tokenizer class="solr.PatternTokenizerFactory"
                   pattern="(?U)[_\W]+" />
        <filter class="solr.ICUFoldingFilterFactory" />
      </analyzer>
      <!-- BM25 scoring with k1 = 1.2 and b = 0.95 -->
      <similarity class="solr.BM25SimilarityFactory">
        <str name="k1">1.2</str>
        <str name="b">0.95</str>
      </similarity>
    </fieldType>

    <!-- text content type: StandardTokenizer segmentation followed by ICU folding,
         applied identically at index and query time -->
    <fieldType name="text_faba"
               class="solr.TextField"
               positionIncrementGap="10">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory" />
        <filter class="solr.ICUFoldingFilterFactory" />
      </analyzer>
      <!-- BM25 scoring with k1 = 1.2 and b = 0.95 -->
      <similarity class="solr.BM25SimilarityFactory">
        <str name="k1">1.2</str>
        <str name="b">0.95</str>
      </similarity>
    </fieldType>

    <!-- feed every "accessors" value into "parents" as well, so that the parent_path index analyzer
         can extract the parent directory of each accessor path -->
    <copyField source="accessors" dest="parents" />

    <!-- TEMPLATE INSERTION POINT -->

</schema>
