Skip to content

Commit 525b2ee

Browse files
author
Sam Hokin
committed
2 parents 8c2072a + 3af819d commit 525b2ee

File tree

10 files changed

+28
-17
lines changed

10 files changed

+28
-17
lines changed

chatbot/build.gradle

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -53,16 +53,7 @@ dependencies {
5353
implementation group: 'org.eclipse', name: 'yasson', version: '1.0.11'
5454

5555
// https://mvnrepository.com/artifact/io.pinecone/pinecone-client
56-
// implementation group: 'io.pinecone', name: 'pinecone-client', version: '0.2.3'
57-
58-
// https://mvnrepository.com/artifact/com.google.protobuf/protobuf-java
59-
implementation group: 'com.google.protobuf', name: 'protobuf-java', version: '3.23.3'
60-
// https://mvnrepository.com/artifact/io.grpc/grpc-protobuf
61-
implementation group: 'io.grpc', name: 'grpc-protobuf', version: '1.53.0'
62-
implementation group: 'io.grpc', name: 'grpc-stub', version: '1.53.0'
63-
implementation group: 'io.grpc', name: 'grpc-netty', version: '1.53.0'
64-
65-
56+
implementation group: 'io.pinecone', name: 'pinecone-client', version: '0.2.3'
6657

6758
// https://mvnrepository.com/artifact/org.projectlombok/lombok
6859
compileOnly group: 'org.projectlombok', name: 'lombok', version: '1.18.26'

chatbot/libs/ncgr-pubmed.jar

84 Bytes
Binary file not shown.
-253 KB
Binary file not shown.

chatbot/scripts/curl-fetch.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ IDS=$2
55

66
curl -X GET \
77
-H "Api-Key: $PINECONE_API_KEY" \
8-
"https://$INDEX_NAME-index-$PINECONE_PROJECT_NAME.svc.$PINECONE_ENVIRONMENT.pinecone.io/vectors/fetch?ids=$IDS"
8+
"https://$INDEX_NAME-$PINECONE_PROJECT_NAME.svc.$PINECONE_ENVIRONMENT.pinecone.io/vectors/fetch?ids=$IDS"

chatbot/src/main/java/org/ncgr/chatbot/PubAgEmbeddingsUpserter.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,15 @@ public static void main(String[] args) throws IOException, ParserConfigurationEx
151151
}
152152

153153
// remove abstracts that lack text
154+
int emptyCount = 0;
154155
List<Abstract> all = new ArrayList<>(abstracts); // avoid concurrent mod
155156
for (Abstract a : all) {
156157
if ((a.getText() == null) || (a.getText().length() == 0)) {
157158
abstracts.remove(a);
159+
emptyCount++;
158160
}
159161
}
162+
System.out.println("## Removed " + emptyCount + " empty abstracts.");
160163

161164
// upsert our abstracts
162165
if (abstracts!=null && abstracts.size()>0) {

chatbot/src/main/java/org/ncgr/chatbot/PubMedEmbeddingsUpserter.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,9 @@ public static void main(String[] args) throws IOException, ParserConfigurationEx
135135
if (!existingVectors.containsKey(id)) pmidsToUpsert.add(pmid);
136136
}
137137
if (pmidsToUpsert.size() > 0) {
138-
System.out.println("## Found " + pmidsToUpsert.size() + " new abstracts.");
138+
System.out.println("## " + pmidsToUpsert.size() + " are new PMIDs.");
139139
abstracts = Pubmed.getAbstracts(pmidsToUpsert, apikey);
140+
System.out.println("## " + abstracts.size() + " new abstracts were fetched.");
140141
}
141142
}
142143
} else {
@@ -162,12 +163,15 @@ public static void main(String[] args) throws IOException, ParserConfigurationEx
162163
}
163164

164165
// remove abstracts that lack text
166+
int emptyCount = 0;
165167
List<Abstract> all = new ArrayList<>(abstracts); // avoid concurrent mod
166168
for (Abstract a : all) {
167169
if ((a.getText() == null) || (a.getText().length() == 0)) {
168170
abstracts.remove(a);
171+
emptyCount++;
169172
}
170173
}
174+
System.out.println("## Removed " + emptyCount + " empty abstracts.");
171175

172176
// upsert the abstracts (only new ones if --update given)
173177
if (abstracts!=null && abstracts.size()>0) {

chatbot/update-legumebot-pubag.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,5 @@ for species in "Aeschynomene evenia" \
5050
"Vigna unguiculata"
5151
do
5252
echo $species
53-
scripts/pubag-embeddings-upserter.sh -i legumebot -u -t "$species"
53+
scripts/pubag-embeddings-upserter.sh -i legumebot-index -u -t "$species"
5454
done

chatbot/update-legumebot-pubmed.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,5 @@ for species in "Aeschynomene evenia" \
5050
"Vigna unguiculata"
5151
do
5252
echo $species
53-
scripts/pubmed-embeddings-upserter.sh -u -i legumebot -r 10000 -t "$species"
53+
scripts/pubmed-embeddings-upserter.sh -u -i legumebot-index -r 10000 -t "$species"
5454
done

pubmed/src/main/java/org/ncgr/pubmed/Abstract.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,12 +135,12 @@ public List<String> getKeywords() {
135135
@Override
136136
public String toString() {
137137
StringBuffer sb = new StringBuffer();
138-
if (title.startsWith("Title: ")) {
138+
if (title!=null && title.startsWith("Title: ")) {
139139
sb.append(title + "\n");
140140
} else {
141141
sb.append("Title: " + title + "\n");
142142
}
143-
if (text.startsWith("Abstract: ")) {
143+
if (text!=null && text.startsWith("Abstract: ")) {
144144
sb.append(text + "\n");
145145
} else {
146146
sb.append("Abstract: " + text + "\n");

pubmed/src/main/java/org/ncgr/pubmed/Pubmed.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@
5454
*/
5555
public class Pubmed {
5656

57+
final static boolean DEBUG = false;
58+
5759
/**
5860
* Unmarshal an ESearchResult from a given esearch URI.
5961
*
@@ -119,13 +121,24 @@ public static PmcArticleset getPmcArticleset(String uri) throws JAXBException, X
119121
* @return a list of Abstract objects
120122
*/
121123
static List<Abstract> getAbstracts(PubmedArticleSetDocument articleSetDocument) {
124+
int rejectedCount = 0;
122125
List<Abstract> abstractList = new ArrayList<>();
123126
for (PubmedArticleType pubmedArticleType : articleSetDocument.getPubmedArticleSet().getPubmedArticleArray()) {
124127
Abstract a = new Abstract(pubmedArticleType);
125128
if (a.getTitle()!=null && a.getPMID()!=null) {
126129
abstractList.add(a);
127-
}
130+
} else {
131+
rejectedCount++;
132+
if (DEBUG) {
133+
System.out.println("## Pubmed.getAbstracts: rejected abstract:");
134+
System.out.println(a);
135+
}
136+
}
128137
}
138+
if (DEBUG) {
139+
System.out.println("## Pubmed.getAbstracts: " + rejectedCount + " abstracts lacked title or PMID.");
140+
System.out.println("## Pubmed.getAbstracts: " + abstractList.size() + " abstracts were returned.");
141+
}
129142
return abstractList;
130143
}
131144

0 commit comments

Comments
 (0)