/*
 * Decompiled with CFR 0.152.
 */
package ai.grazie.nlp.encoder;

import ai.grazie.nlp.utils.UtilsKt;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import kotlin.Metadata;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.ranges.IntRange;
import kotlin.ranges.RangesKt;
import kotlin.text.StringsKt;
import org.jetbrains.annotations.NotNull;

/**
 * Splits whitespace-separated words into sub-word pieces using a greedy
 * longest-match-first lookup against a fixed vocabulary.
 *
 * <p>For each word, the longest prefix present in the vocabulary is taken as
 * the first piece; the remainder is matched the same way, with every
 * non-initial candidate looked up under a {@code "##"} prefix. A word that
 * cannot be fully covered, or that is longer than the configured character
 * limit, is replaced by the unknown token.
 *
 * <p>Only the keys of the vocabulary are consulted by this class; the mapped
 * integer values are never read here.
 */
@Metadata(mv={1, 7, 0}, k=1, xi=48, d1={"\u0000\"\n\u0002\u0018\u0002\n\u0002\u0010\u0000\n\u0000\n\u0002\u0010$\n\u0002\u0010\u000e\n\u0002\u0010\b\n\u0002\b\u0004\n\u0002\u0010 \n\u0002\b\u0002\u0018\u00002\u00020\u0001B+\u0012\u0012\u0010\u0002\u001a\u000e\u0012\u0004\u0012\u00020\u0004\u0012\u0004\u0012\u00020\u00050\u0003\u0012\u0006\u0010\u0006\u001a\u00020\u0004\u0012\b\b\u0002\u0010\u0007\u001a\u00020\u0005\u00a2\u0006\u0002\u0010\bJ\u0014\u0010\t\u001a\b\u0012\u0004\u0012\u00020\u00040\n2\u0006\u0010\u000b\u001a\u00020\u0004R\u000e\u0010\u0007\u001a\u00020\u0005X\u0082\u0004\u00a2\u0006\u0002\n\u0000R\u000e\u0010\u0006\u001a\u00020\u0004X\u0082\u0004\u00a2\u0006\u0002\n\u0000R\u001a\u0010\u0002\u001a\u000e\u0012\u0004\u0012\u00020\u0004\u0012\u0004\u0012\u00020\u00050\u0003X\u0082\u0004\u00a2\u0006\u0002\n\u0000\u00a8\u0006\f"}, d2={"Lai/grazie/nlp/encoder/WordPiece;", "", "vocabulary", "", "", "", "unknownToken", "maxInputCharsPerWord", "(Ljava/util/Map;Ljava/lang/String;I)V", "tokenize", "", "text", "nlp-encoder-engine"})
public final class WordPiece {
    /** Prefix prepended to every candidate piece that does not start at the beginning of its word. */
    private static final String CONTINUATION_PREFIX = "##";

    /** Known pieces; stored by reference (not copied) and queried with {@code containsKey} only. */
    @NotNull
    private final Map<String, Integer> vocabulary;
    /** Token emitted in place of a word that is too long or cannot be fully segmented. */
    @NotNull
    private final String unknownToken;
    /** Words with more than this many {@code char}s are mapped straight to the unknown token. */
    private final int maxInputCharsPerWord;

    /**
     * Creates a tokenizer over the given vocabulary.
     *
     * @param vocabulary           map whose keys are the known pieces; must not be {@code null}
     * @param unknownToken         token emitted for words that cannot be segmented; must not be {@code null}
     * @param maxInputCharsPerWord maximum word length (in {@code char}s) that is still segmented
     */
    public WordPiece(@NotNull Map<String, Integer> vocabulary, @NotNull String unknownToken, int maxInputCharsPerWord) {
        Intrinsics.checkNotNullParameter(vocabulary, "vocabulary");
        Intrinsics.checkNotNullParameter(unknownToken, "unknownToken");
        this.vocabulary = vocabulary;
        this.unknownToken = unknownToken;
        this.maxInputCharsPerWord = maxInputCharsPerWord;
    }

    /**
     * Compiler-generated bridge for the defaulted third constructor argument.
     *
     * <p>When bit mask {@code 4} is set in {@code n2}, the supplied {@code n} is
     * ignored and {@code 100} is used as the maximum word length instead. The
     * marker argument is not read.
     *
     * @param map                       vocabulary, forwarded unchanged
     * @param string                    unknown token, forwarded unchanged
     * @param n                         maximum word length, used unless defaulted via {@code n2}
     * @param n2                        default-argument bit mask
     * @param defaultConstructorMarker  unused; only disambiguates this overload
     */
    public /* synthetic */ WordPiece(Map<String, Integer> map, String string, int n, int n2, DefaultConstructorMarker defaultConstructorMarker) {
        // The delegating call has to be the first statement of a constructor,
        // so the default is chosen inside the argument expression.
        this(map, string, (n2 & 4) != 0 ? 100 : n);
    }

    /**
     * Tokenizes {@code text2} into vocabulary pieces.
     *
     * <p>The text is first split into words by {@code UtilsKt.tokenizeByWhitespace}.
     * Each word contributes either its sequence of matched pieces or a single
     * unknown token (when it exceeds the length limit or cannot be fully matched).
     *
     * @param text2 input text; must not be {@code null}
     * @return a new list with the pieces of all words, in input order
     */
    @NotNull
    public final List<String> tokenize(@NotNull String text2) {
        Intrinsics.checkNotNullParameter(text2, "text");
        List<String> out = new ArrayList<>();
        for (String token : UtilsKt.tokenizeByWhitespace(text2)) {
            if (token.length() > this.maxInputCharsPerWord) {
                out.add(this.unknownToken);
                continue;
            }
            // Collect into a scratch list first: a word that fails half-way
            // must contribute only the unknown token, not its partial pieces.
            List<String> pieces = new ArrayList<>();
            if (this.splitIntoPieces(token, pieces)) {
                out.addAll(pieces);
            } else {
                out.add(this.unknownToken);
            }
        }
        return out;
    }

    /**
     * Greedily segments a single word, appending the matched pieces to {@code pieces}.
     *
     * @param token  the word to segment
     * @param pieces destination for matched pieces; may hold a partial result when {@code false} is returned
     * @return {@code true} if the whole word was covered by vocabulary entries,
     *         {@code false} if some position had no matching entry
     */
    private boolean splitIntoPieces(String token, List<String> pieces) {
        int start = 0;
        while (start < token.length()) {
            String match = null;
            int end = token.length();
            // Try the longest remaining span first and shrink from the right.
            for (; start < end; --end) {
                String candidate = token.substring(start, end);
                if (start > 0) {
                    candidate = CONTINUATION_PREFIX + candidate;
                }
                if (this.vocabulary.containsKey(candidate)) {
                    match = candidate;
                    break;
                }
            }
            if (match == null) {
                return false;
            }
            pieces.add(match);
            // Resume right after the matched span.
            start = end;
        }
        return true;
    }
}

