I was reading https://developers.google.com/webmasters/ajax-crawling/
on how to make Ajax apps (and consequently GWT apps) crawlable.
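(For reference, the scheme maps hash-bang URLs to "ugly" URLs roughly like
this; the URL below is my own example, not one taken from the doc:

  URL seen by users:          http://www.example.com/index.html#!key=value
  URL fetched by the crawler: http://www.example.com/index.html?_escaped_fragment_=key%3Dvalue

so the server is expected to answer the second form with an HTML snapshot.)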
I took the code from Google (summarized in point 3 of "How to create
an HTML snapshot?") to create a filter that returns the HTML of the Ajax
page using HtmlUnit, and changed the web.xml accordingly. I created a new
GWT project with the example code and applied the filter and the web.xml
changes there. It worked right away.
However, I did exactly the same thing in the GWT app I actually want to
make searchable, and there it doesn't work. For some reason, the only
requests the filter ever gets are the ones for the RPC calls.
I think I must be missing a terribly simple detail, but I'm a bit lost
on where to go from here.
Below you can see the code for the filter (CrawlServlet) and the
web.xml.
package crawltest.server;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.logging.Logger;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/**
 * Servlet that makes this application crawlable.
 */
public final class CrawlServlet implements Filter {

  private static final Logger logger =
      Logger.getLogger(CrawlServlet.class.getName());

  private static String rewriteQueryString(String queryString)
      throws UnsupportedEncodingException {
    StringBuilder queryStringSb = new StringBuilder(queryString);
    int i = queryStringSb.indexOf("&_escaped_fragment_");
    if (i != -1) {
      StringBuilder tmpSb = new StringBuilder(queryStringSb.substring(0, i));
      tmpSb.append("#!");
      tmpSb.append(URLDecoder.decode(
          queryStringSb.substring(i + 20, queryStringSb.length()), "UTF-8"));
      queryStringSb = tmpSb;
    }
    i = queryStringSb.indexOf("_escaped_fragment_");
    if (i != -1) {
      StringBuilder tmpSb = new StringBuilder(queryStringSb.substring(0, i));
      tmpSb.append("#!");
      tmpSb.append(URLDecoder.decode(
          queryStringSb.substring(i + 19, queryStringSb.length()), "UTF-8"));
      queryStringSb = tmpSb;
    }
    if (queryStringSb.indexOf("#!") != 0) {
      queryStringSb.insert(0, '?');
    }
    queryString = queryStringSb.toString();
    return queryString;
  }

  private FilterConfig filterConfig = null;

  /**
   * Destroys the filter configuration.
   */
  public void destroy() {
    this.filterConfig = null;
  }

  /**
   * Filters all requests and invokes the headless browser if necessary.
   */
  public void doFilter(ServletRequest request, ServletResponse response,
      FilterChain chain) throws IOException {
    System.out.println("crawl");
    if (filterConfig == null) {
      return;
    }
    System.out.println("crawl");
    HttpServletRequest req = (HttpServletRequest) request;
    HttpServletResponse res = (HttpServletResponse) response;
    String queryString = req.getQueryString();
    System.out.println("query:" + queryString);
    System.out.println("param:" + req.getParameterMap().toString());
    System.out.println("req:" + req);
    if ((queryString != null) && (queryString.contains("_escaped_fragment_"))) {
      System.out.println("in!!");
      StringBuilder pageNameSb = new StringBuilder("http://");
      pageNameSb.append(req.getServerName());
      if (req.getServerPort() != 0) {
        pageNameSb.append(":");
        pageNameSb.append(req.getServerPort());
      }
      pageNameSb.append(req.getRequestURI());
      queryString = rewriteQueryString(queryString);
      pageNameSb.append(queryString);
      final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_3);
      webClient.setJavaScriptEnabled(true);
      String pageName = pageNameSb.toString();
      HtmlPage page = webClient.getPage(pageName);
      webClient.waitForBackgroundJavaScriptStartingBefore(2000);
      res.setContentType("text/html;charset=UTF-8");
      PrintWriter out = res.getWriter();
      out.println("<hr>");
      out.println("<center><h3>You are viewing a non-interactive page"
          + " that is intended for the crawler. You probably want to see this page:"
          + " <a href=\"" + pageName + "\">" + pageName + "</a></h3></center>");
      out.println("<hr>");
      out.println(page.asXml());
      webClient.closeAllWindows();
      out.close();
    } else {
      try {
        chain.doFilter(request, response);
      } catch (ServletException e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Initializes the filter configuration.
   */
  public void init(FilterConfig filterConfig) {
    this.filterConfig = filterConfig;
  }
}
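(As a sanity check of the rewriting logic only, and not part of the original
code, a hypothetical main method like the one below could be dropped into
CrawlServlet, since rewriteQueryString is private static:

  // Hypothetical helper for a quick local test of the query-string rewrite.
  public static void main(String[] args) throws UnsupportedEncodingException {
    // A crawler request for .../CrawlTest.html#!page=about arrives as
    // ?_escaped_fragment_=page%3Dabout and should be rewritten to "#!page=about".
    System.out.println(rewriteQueryString("_escaped_fragment_=page%3Dabout"));
    // Extra parameters should be kept and a leading '?' prepended;
    // expected output: ?locale=en#!page=about
    System.out.println(rewriteQueryString("locale=en&_escaped_fragment_=page%3Dabout"));
  }
)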
web.xml:
<?xml version="1.0" encoding="UTF-8"?>
<web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://java.sun.com/xml/ns/javaee
                             http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd"
         version="2.5" xmlns="http://java.sun.com/xml/ns/javaee">

  <filter>
    <filter-name>CrawlServlet</filter-name>
    <filter-class>crawltest.server.CrawlServlet</filter-class>
  </filter>

  <filter-mapping>
    <filter-name>CrawlServlet</filter-name>
    <url-pattern>/*</url-pattern>
  </filter-mapping>

  <!-- Servlets -->

  <!-- Default page to serve -->
  <welcome-file-list>
    <welcome-file>CrawlTest.html</welcome-file>
  </welcome-file-list>

</web-app>
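(To see whether the filter is reached at all in the deployed app, something
like the small client below could be used; the host, port and page name are
assumptions based on the welcome file above, so adjust them to the actual
deployment. If the filter is mapped and hit, the response should be the
HtmlUnit snapshot with the "non-interactive page" banner; if the plain host
page comes back instead, the request never went through doFilter:

  // Minimal sketch, not part of the original post: fetches the host page with
  // an _escaped_fragment_ query so that CrawlServlet should intercept it.
  import java.io.BufferedReader;
  import java.io.InputStreamReader;
  import java.net.URL;

  public class CrawlFilterCheck {
    public static void main(String[] args) throws Exception {
      // Assumed local address; replace with the app's real host, port and page.
      URL url = new URL("http://localhost:8888/CrawlTest.html?_escaped_fragment_=");
      BufferedReader in = new BufferedReader(
          new InputStreamReader(url.openStream(), "UTF-8"));
      String line;
      while ((line = in.readLine()) != null) {
        System.out.println(line);
      }
      in.close();
    }
  }
)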